"""Convert saved Barnes & Noble result pages into a single JSON table.

Reads ``RESULTS/results1.html`` .. ``RESULTS/results<num_results>.html``,
extracts one record ("tuple") per page and writes the schema plus all
records to ``barnesandnoble.json``.
"""
import json
import re
import sys

from bs4 import BeautifulSoup as bs

sys.setrecursionlimit(1500)

table = {
    "table": {
        "name": "Barnes&noble",
        "description": "Books from Barnes and Noble",
        "idAttrib": {
            "name": "isbn-13",
            "type": "INT"
        },
        "attributes": [
            {"name": "audiobook bn price", "type": "TEXT"},
            {"name": "authors", "type": "TEXT"},
            {"name": "edition", "type": "INT"},
            {"name": "hardcover bn price", "type": "TEXT"},
            {"name": "isbn-13", "type": "INT"},
            {"name": "pages", "type": "INT"},
            {"name": "paperback bn price", "type": "TEXT"},
            {"name": "product dimensions", "type": "TEXT"},
            {"name": "publication date", "type": "TEXT"},
            {"name": "publisher", "type": "TEXT"},
            {"name": "related categories", "type": "TEXT"},
            {"name": "series", "type": "TEXT"},
            {"name": "title", "type": "TEXT"},
        ],
        "tuples": []
    }
}

num_results = 3600

# Placeholder stored for every attribute that a page does not provide.
_MISSING = "na"

# Label shown in the "product details" list -> attribute name in the record.
_DETAIL_FIELDS = {
    "Publication date": "publication date",
    "ISBN-13": "isbn-13",
    "Publisher": "publisher",
    "Pages": "pages",
    "Product dimensions": "product dimensions",
    "Edition Number": "edition",
    "Series": "series",
}

# Value of the format tab's data-bntrack attribute -> attribute name.
_PRICE_FIELDS = (
    ("Audiobook", "audiobook bn price"),
    ("Hardcover", "hardcover bn price"),
    ("Paperback", "paperback bn price"),
)

_SPACE_RUN = re.compile(r" {2,}")


def _clean_text(text):
    """Collapse every run of spaces to one space, then drop newlines."""
    return _SPACE_RUN.sub(" ", text).replace("\n", "")


def _list_item_texts(soup, css_class):
    """Return the raw text of every <li> inside each <ul class=css_class>."""
    texts = []
    for group in soup.find_all('ul', class_=css_class):
        texts.extend(item.text for item in group.find_all('li'))
    return texts


def _join_last_first(parts):
    """Join *parts* with "; ", last element first.

    Reproduces the result of repeatedly prepending ``part + "; "`` to an
    accumulator and then trimming surrounding whitespace and trailing
    semicolons, without the repeated string copying.
    """
    joined = "".join(part + "; " for part in reversed(parts))
    return joined.strip().rstrip(';')


def _format_price(soup, format_name):
    """Return the B&N price text for one format tab, or "na" if absent.

    When several price nodes match, the last one in document order wins.
    """
    price = _MISSING
    for entry in soup.find_all('li', class_="format ",
                               attrs={"data-bntrack": format_name}):
        for node in entry.find_all(attrs={"data-bntrack": "ParentBNPrice"}):
            price = node.text
    return price


def _parse_book(soup):
    """Build one record from a parsed product page.

    Every attribute that cannot be found on the page is set to "na", so
    all records carry the full set of schema attributes.
    """
    book = {}

    # A page without the og:title meta tag gets the placeholder instead of
    # aborting the whole run.
    title_meta = soup.find('meta', {'property': 'og:title'})
    if title_meta is None:
        book['title'] = _MISSING
    else:
        book['title'] = title_meta.get('content', _MISSING)

    # Each detail entry reads "<label>:<value>"; only known labels are kept.
    details = soup.find('div', {'class': 'product-details box'})
    if details is not None:
        for item in details.find_all('li'):
            label, _, value = _clean_text(item.text).partition(":")
            field = _DETAIL_FIELDS.get(label)
            if field is not None:
                book[field] = value.strip()
    for field in _DETAIL_FIELDS.values():
        book.setdefault(field, _MISSING)

    # The literal "by" list item is a label, not a contributor.
    names = [
        _clean_text(text)
        for text in _list_item_texts(soup, "contributors")
        if text != 'by'
    ]
    book['authors'] = _join_last_first(names)

    book['related categories'] = _join_last_first(
        _list_item_texts(soup, "related-categories box")
    )

    for format_name, field in _PRICE_FIELDS:
        book[field] = _format_price(soup, format_name)

    return book


for j in range(1, num_results + 1):
    print(j)  # progress indicator
    filename = "RESULTS/results" + str(j) + ".html"
    # The context manager closes the page file even if parsing raises.
    with open(filename, "r", encoding='utf-8') as f:
        # NOTE(review): no parser is named, so bs4 chooses among the
        # parsers installed on this machine; pin one if results must be
        # reproducible across environments.
        soup = bs(f)
    table["table"]["tuples"].append(_parse_book(soup))

# The output is opened only after all pages parsed, so a failure part-way
# through does not truncate an existing result file.
with open("barnesandnoble.json", "w", encoding="utf-8") as outfile:
    json.dump(table, outfile, indent=4, sort_keys=True,
              separators=(',', ':'))