from bs4 import BeautifulSoup as bs
import json as sj
import sys
import gc
# Raise the interpreter's recursion limit to 1500.
# NOTE(review): presumably needed for deeply nested HTML handled further down
# in this script -- confirm before changing or removing.
sys.setrecursionlimit(1500)

# Output document skeleton: table metadata, the column schema, and an
# initially empty "tuple" list that receives one dict per parsed book.
# NOTE(review): "isbn-13" is declared INT in idAttrib but TEXT in the
# attribute list, and "isbn-10" is declared INT; left exactly as found.
table = {
    "table": {
        "name": "Amazon",
        "description": "Books from Amazon",
        "idAttrib": {"name": "isbn-13", "type": "INT"},
        "attributes": [
            {"name": column, "type": column_type}
            for column, column_type in (
                ("authors", "TEXT"),
                ("date", "TEXT"),
                ("edition", "TEXT"),
                ("hardcover price", "TEXT"),
                ("isbn-10", "INT"),
                ("isbn-13", "TEXT"),
                ("language", "TEXT"),
                ("pages", "TEXT"),
                ("paperback price", "TEXT"),
                ("product dimensions", "TEXT"),
                ("publisher", "TEXT"),
                ("title", "TEXT"),
            )
        ],
        "tuple": [],
    }
}

# Highest page index to try; pages are read from RESULTS/results_<n>.html
# for n in 1..num_results. Missing or unreadable pages are reported and skipped.
num_results = 3600

# Product-detail labels whose sibling text is copied verbatim into the row.
_VERBATIM_DETAILS = ('isbn-10', 'isbn-13', 'language', 'product dimensions')

# Columns back-filled with "na" when a page did not provide them.
_DEFAULTED_COLUMNS = (
    'isbn-10',
    'isbn-13',
    'language',
    'pages',
    'product dimensions',
    'publisher',
    'date',
    'edition',
    'paperback price',
    'hardcover price',
)

for j in range(1, 1 + num_results):
    print(j)
    book = {}
    filename = "RESULTS/results_" + str(j) + ".html"
    try:
        f = open(filename, "r", encoding="utf8")
    except IOError as e:
        print("I/O error({0}): {1}".format(e.errno, e.strerror))
        continue
    # The constructor reads and parses the whole file, so the handle can be
    # released as soon as it returns; `with` also closes it if parsing raises.
    # NOTE(review): no parser is named, so BeautifulSoup picks whichever one is
    # installed; pin one explicitly once the intended parser is confirmed.
    with f:
        soup = bs(f)

    # Drop <script> elements so their text cannot leak into extracted fields.
    for elem in soup.find_all(['script']):
        elem.extract()

    # --- title -----------------------------------------------------------
    title_span = soup.find('span', {'id': 'productTitle'})
    book["title"] = title_span.text if title_span is not None else "na"

    # --- authors: "name;name;" (each name followed by a semicolon) --------
    title_block = soup.find("div", id="booksTitle")
    if title_block is not None:
        names = []
        for author in title_block.find_all("span", class_="author notFaded"):
            links = author.find_all('a', recursive=False)
            if len(links) == 1:
                link = links[0]
            else:
                # Otherwise the link is expected inside a direct child <span>.
                wrappers = author.find_all('span', recursive=False)
                link = wrappers[0].a if wrappers else None
            first = link.contents[0] if link is not None and link.contents else None
            # Only plain text nodes carry a name; skip unexpected markup
            # instead of aborting the whole run.
            if isinstance(first, str):
                names.append(first.strip())
        book["authors"] = "".join(name + ';' for name in names)
    else:
        book["authors"] = "na"

    # --- product details list ---------------------------------------------
    details = soup.find("table", id="productDetailsTable")
    details_list = details.ul if details is not None else None
    if details_list is not None:
        for entry in details_list.find_all('li', recursive=False):
            label = entry.b
            if label is None or not label.contents:
                continue
            label_text = label.contents[0]
            sibling = label.next_sibling
            if not isinstance(label_text, str) or not isinstance(sibling, str):
                continue
            # Store plain str copies: a NavigableString keeps a reference
            # into the parse tree that is torn down at the end of the loop.
            value = str(sibling)
            detail = label_text.strip()
            if detail.endswith(':'):
                detail = detail[:-1]

            if detail == "Publisher":
                # Expected shape: "<publisher>[; <edition>] (<date>)".
                open_paren = value.find('(')
                if open_paren == -1:
                    # No date part: keep the whole text rather than slicing
                    # with find()'s -1 result, which would chop characters.
                    head = value
                else:
                    head = value[:open_paren]
                    close_paren = value.find(')', open_paren + 1)
                    if close_paren == -1:
                        close_paren = len(value)
                    book['date'] = value[open_paren + 1:close_paren]
                semicolon = head.find(';')
                if semicolon == -1:
                    book['edition'] = "na"
                    book['publisher'] = head.rstrip()
                else:
                    book['publisher'] = head[:semicolon]
                    book['edition'] = head[semicolon + 1:]
            else:
                lowered = detail.lower()
                if lowered in _VERBATIM_DETAILS:
                    book[lowered] = value
                # The page count sits under a format label (not a fixed
                # key), so it is recognised by its text instead.
                if "pages" in value:
                    book['pages'] = value

    # --- price table --------------------------------------------------------
    for item in soup.find_all(attrs={'class': "a-normal a-spacing-none"}):
        # Reset per item so a row missing one of its spans cannot inherit
        # the label or price of an earlier row (or an earlier book).
        format_label = None
        price = None
        for span in item.find_all(attrs={'class': "a-size-small a-color-base"}):
            format_label = span.string
        for span in item.find_all(attrs={'class': "a-size-small a-color-price"}):
            price = span.string
        # .string is None when the span has no single text child.
        if format_label is None or price is None:
            continue
        format_name = format_label.lower()
        if format_name == 'hardcover':
            book['hardcover price'] = str(price)
        if format_name == 'paperback':
            book['paperback price'] = str(price)

    for column in _DEFAULTED_COLUMNS:
        book.setdefault(column, "na")

    table["table"]["tuple"].append(book)
    # Tear the tree down explicitly and collect to keep memory flat across
    # the many pages processed in one run.
    soup.decompose()
    gc.collect()

# Open the output only once all rows are collected, so an aborted run does not
# leave a truncated file behind, and let `with` close it in every case.
with open("amazon_full.json", "w") as outfile:
    outfile.write(sj.dumps(table, indent=4, sort_keys=True, separators=(',', ':')))