"""Scrape saved Amazon book pages into a single JSON table.

Reads ``RESULTS/results_<n>.html`` for n in 1..``num_results``, pulls the
title, authors, product details and hardcover/paperback prices out of each
page, and writes everything to ``amazon_full.json``.  Any field that cannot
be found on a page is recorded as the string ``"na"``.
"""
import gc
import json as sj
import sys

from bs4 import BeautifulSoup as bs

sys.setrecursionlimit(1500)

# Output document: schema header plus one dict per book under "tuple".
table = {
    "table": {
        "name": "Amazon",
        "description": "Books from Amazon",
        "idAttrib": {
            "name": "isbn-13",
            "type": "INT"
        },
        "attributes": [
            {"name": "authors", "type": "TEXT"},
            {"name": "date", "type": "TEXT"},
            {"name": "edition", "type": "TEXT"},
            {"name": "hardcover price", "type": "TEXT"},
            {"name": "isbn-10", "type": "INT"},
            {"name": "isbn-13", "type": "TEXT"},
            {"name": "language", "type": "TEXT"},
            {"name": "pages", "type": "TEXT"},
            {"name": "paperback price", "type": "TEXT"},
            {"name": "product dimensions", "type": "TEXT"},
            {"name": "publisher", "type": "TEXT"},
            {"name": "title", "type": "TEXT"},
        ],
        "tuple": []
    }
}

# Number of result pages to read (results_1.html .. results_<num_results>.html).
num_results = 3600

_NA = "na"

# Product-detail labels that are copied verbatim into the book record.
_DETAIL_KEYS = ("isbn-10", "isbn-13", "language", "product dimensions")

# Fields that fall back to "na" when the page did not provide them.
_DEFAULTED_KEYS = (
    "isbn-10",
    "isbn-13",
    "language",
    "pages",
    "product dimensions",
    "publisher",
    "date",
    "edition",
    "paperback price",
    "hardcover price",
)


def _author_name(author_span):
    """Return the author name held in one author span, or None if absent.

    The name is the first text node of either the span's single direct
    ``<a>`` child or, failing that, the ``<a>`` inside its first direct
    ``<span>`` child.
    """
    links = author_span.find_all("a", recursive=False)
    if len(links) == 1:
        link = links[0]
    else:
        wrappers = author_span.find_all("span", recursive=False)
        link = wrappers[0].a if wrappers else None
    if link is None or not link.contents:
        return None
    first = link.contents[0]
    # A nested tag here (rather than text) has no usable name.
    if not isinstance(first, str):
        return None
    return first.strip()


def _collect_authors(soup):
    """Return all author names as a ';'-terminated string, or "na"."""
    title_block = soup.find("div", id="booksTitle")
    if title_block is None:
        return _NA
    names = []
    for author_span in title_block.find_all("span", class_="author notFaded"):
        name = _author_name(author_span)
        if name is not None:
            names.append(name)
    return "".join(name + ";" for name in names)


def _split_publisher(raw):
    """Split a publisher value into ``(publisher, edition, date)``.

    Handles the forms ``"Name (date)"`` and ``"Name; edition (date)"``.
    Missing parts come back as "na"; a value with no parenthesised group is
    returned whole as the publisher instead of being sliced with a -1 index.
    """
    open_idx = raw.rfind("(")
    close_idx = raw.find(")", open_idx + 1) if open_idx != -1 else -1
    if close_idx != -1:
        date = raw[open_idx + 1:close_idx]
        head = raw[:open_idx]
    else:
        date = _NA
        head = raw

    publisher, semicolon, edition = head.partition(";")
    if semicolon:
        return publisher, edition, date

    # No edition: drop the single space separating the name from "(date)".
    if close_idx != -1 and publisher.endswith(" "):
        publisher = publisher[:-1]
    return publisher, _NA, date


def _add_product_details(soup, book):
    """Copy fields from the ``productDetailsTable`` list items into *book*."""
    details = soup.find("table", id="productDetailsTable")
    if details is None or details.ul is None:
        return
    for row in details.ul.find_all("li", recursive=False):
        label = row.b
        if label is None or not label.contents:
            continue
        first = label.contents[0]
        if not isinstance(first, str):
            continue
        key = first.strip()
        if key.endswith(":"):
            key = key[:-1]

        value = label.next_sibling
        # Only a text node directly after the <b> label is a usable value.
        if not isinstance(value, str):
            continue
        value = str(value)

        if key == "Publisher":
            publisher, edition, date = _split_publisher(value)
            book["publisher"] = publisher
            book["edition"] = edition
            book["date"] = date
            continue

        field = key.lower()
        if field in _DETAIL_KEYS:
            book[field] = value
        # The page count is recognised by its value text, whatever the label.
        if "pages" in value:
            book["pages"] = value


def _last_string(parent, css_class):
    """Return ``.string`` of the last element with *css_class*, or None."""
    matches = parent.find_all(attrs={"class": css_class})
    if not matches:
        return None
    text = matches[-1].string
    return None if text is None else str(text)


def _add_prices(soup, book):
    """Record hardcover/paperback prices from the price table rows."""
    for item in soup.find_all(attrs={"class": "a-normal a-spacing-none"}):
        # Looked up fresh for every row so a row that lacks a label or a
        # price can never reuse values left over from an earlier row.
        label = _last_string(item, "a-size-small a-color-base")
        price = _last_string(item, "a-size-small a-color-price")
        if label is None or price is None:
            continue
        binding = label.lower()
        if binding == "hardcover":
            book["hardcover price"] = price
        elif binding == "paperback":
            book["paperback price"] = price


def _parse_book(soup):
    """Build one book record (a dict of strings) from a parsed page."""
    for script in soup.find_all("script"):
        script.extract()

    book = {}
    title_span = soup.find("span", {"id": "productTitle"})
    book["title"] = title_span.text if title_span is not None else _NA
    book["authors"] = _collect_authors(soup)
    _add_product_details(soup, book)
    _add_prices(soup, book)

    for field in _DEFAULTED_KEYS:
        book.setdefault(field, _NA)
    return book


def main():
    """Parse every result page and write the combined table as JSON."""
    # Opened first, as before, so an unwritable output fails immediately.
    with open("amazon_full.json", "w") as outfile:
        for j in range(1, 1 + num_results):
            print(j)
            filename = "RESULTS/results_" + str(j) + ".html"
            try:
                page = open(filename, "r", encoding="utf8")
            except IOError as e:
                print("I/O error({0}): {1}".format(e.errno, e.strerror))
                continue
            with page:
                # NOTE(review): no parser is named, so bs4 chooses among the
                # installed ones; pin one once the intended parser is known.
                soup = bs(page)

            table["table"]["tuple"].append(_parse_book(soup))

            # Tear the tree down eagerly to keep memory flat across pages.
            soup.decompose()
            gc.collect()

        outfile.write(
            sj.dumps(table, indent=4, sort_keys=True, separators=(',', ':'))
        )


# Runs at module level, exactly as the original flat script did.
main()