#!/usr/bin/env python
"""
Convert to standard JSON format.

Convert the items extracted by Google Play and iTunes spiders written on Scrapy
to the standard format. Outputs in the current directory tableA.json for Google
Play movies, and tableB.json for iTunes Store movies.

Usage: convert.py <Google-Play-Scrapy-JSON> <iTunes-Scrapy-JSON>
"""

import re
import sys
import json
from collections import OrderedDict

def usage():
    """Print the command-line usage string to standard output.

    The program name shown in the message is taken from sys.argv[0].
    """
    message = 'Usage: {0} <google-play-scrapy> <itunes-scrapy>'
    # A parenthesised single argument prints identically under the Python 2
    # print statement and the Python 3 print function.
    print(message.format(sys.argv[0]))

def google_play_rating(rating):
    """Compute the average star rating from per-star rating counts.

    Args:
        rating: Sequence of counts, where the i-th entry (0-based) is the
            number of (i+1)-star ratings.

    Returns:
        The count-weighted average number of stars formatted with two
        decimals (e.g. '4.75'), or an empty string when the total number of
        ratings is zero.
    """
    total_stars = 0
    total_votes = 0
    # Star values are 1-based, so start the enumeration at 1.
    for stars, count in enumerate(rating, 1):
        total_stars += stars * count
        total_votes += count
    if total_votes == 0:
        return ''
    # float() forces true division under Python 2 as well.
    return '{0:.2f}'.format(float(total_stars) / total_votes)

def get_price(text):
    """Parse the leading dollar amount of a price string as a float.

    Args:
        text: Price string expected to start with a dollar amount such as
            '$14' or '$14.00'. Anything following the amount is ignored.

    Returns:
        Float value of the price, or 0. when the text does not start with a
        well-formed dollar amount.

    >>> abs(get_price('$14.00') - 14.) < 1e-8
    True
    >>> abs(get_price('$14') - 14.) < 1e-8
    True
    >>> abs(get_price('ab') - 0.) < 1e-8
    True
    >>> get_price('$.')
    0.0
    """
    # The pattern admits at most one decimal point, so the captured text is
    # always a valid float() literal; a looser character class would capture
    # strings such as '.' or '1.2.3' and make float() raise ValueError.
    m = re.match(r'\$(\d+\.?\d*|\.\d+)', text)
    if m:
        return float(m.group(1))
    return 0.

def google_play_price(offers):
    """Pick the most expensive offer from a list of offers.

    Args:
        offers: List of 2-entry lists, where the 0-th entry is the offer type
            and the 1-st entry is the price string for that offer.
            Ex: [['SD for rent', '$9'], ['HD for rent', '$12']]

    Returns:
        2-tuple (price text, offer type) of the best offer, where best means
        the highest price as parsed by get_price. Among equally priced offers
        the earliest one wins. ('', '') is returned when offers is empty or
        no offer parses to a price greater than zero.
    """
    max_price = 0
    max_price_txt = ''
    max_offer = ''
    for offer, price in offers:
        value = get_price(price)
        # Strict comparison: a later offer must beat, not equal, the current
        # best, and zero-priced offers never replace the empty default.
        if value > max_price:
            max_price = value
            max_price_txt = price
            max_offer = offer
    return max_price_txt, max_offer


def write_google_play(path):
    """Convert Scrapy Google Play items to required JSON format.

    Reads the JSON list of items generated by the Scrapy crawl and writes the
    table (tableA.json, in the current directory) in the required JSON format.
    A field that is missing from an item is written as an empty string.

    Args:
        path: Path to the JSON file containing the Scrapy Google Play items.

    Raises:
        ValueError: If an item has a 'year' whose first value cannot be
            converted to an integer.
    """
    def first(item, key):
        """Return the first value stored under key, or '' if key is absent."""
        return item[key][0] if key in item else ''

    def joined(item, key):
        """Return the values under key joined by ', ' and stripped, or ''."""
        return ', '.join(item[key]).strip() if key in item else ''

    attributes = [('id', 'TEXT'), ('name', 'TEXT'), ('year', 'INTEGER'),
                  ('month', 'TEXT'), ('genre', 'TEXT'), ('description', 'TEXT'),
                  ('actors', 'TEXT'), ('writers', 'TEXT'),
                  ('producers', 'TEXT'), ('directors', 'TEXT'),
                  ('content_rating', 'TEXT'), ('rating', 'TEXT'),
                  ('price', 'TEXT'), ('offer_type', 'TEXT'),
                  ('all_offers', 'TEXT'),
                  ('similar_movies_id', 'TEXT')]

    # OrderedDict keeps the keys of the output file in insertion order.
    table_dict = OrderedDict()
    table_dict['name'] = 'google-play'
    table_dict['description'] = 'Movies from Google Play'
    table_dict['idAttrib'] = {'name': 'id', 'type': 'TEXT'}
    table_dict['attributes'] = [{'name': n, 'type': t} for n, t in attributes]

    # Close the input file deterministically instead of leaking the handle.
    with open(path, 'r') as rfile:
        items = json.load(rfile)

    tuples = []
    for item in items:
        item_dict = OrderedDict()
        item_dict['id'] = first(item, 'id')
        item_dict['name'] = first(item, 'name')
        item_dict['year'] = int(item['year'][0]) if 'year' in item else ''
        item_dict['month'] = first(item, 'month')
        item_dict['genre'] = first(item, 'genre')
        # The crawler stores the description under the 'synopsis' key.
        item_dict['description'] = first(item, 'synopsis').strip()
        item_dict['actors'] = joined(item, 'actors')
        item_dict['writers'] = joined(item, 'writers')
        item_dict['producers'] = joined(item, 'producers')
        item_dict['directors'] = joined(item, 'directors')
        item_dict['content_rating'] = first(item, 'content_rating')
        item_dict['rating'] = google_play_rating(item['rating']) \
                if 'rating' in item else ''
        if 'offers' in item:
            offers = item['offers']
            item_dict['price'], item_dict['offer_type'] = \
                    google_play_price(offers)
            item_dict['all_offers'] = ', '.join(
                    ['{0} at {1}'.format(o[0], o[1]) for o in offers])
        else:
            item_dict['price'] = ''
            item_dict['offer_type'] = ''
            item_dict['all_offers'] = ''
        # Unlike the people lists above, the ids are joined without stripping.
        item_dict['similar_movies_id'] = \
                ', '.join(item['similar_movies_id']) \
                if 'similar_movies_id' in item else ''
        tuples.append(item_dict)

    play_dict = OrderedDict()
    play_dict['table'] = table_dict
    play_dict['tuples'] = tuples
    with open('tableA.json', 'w') as wfile:
        # indent=1 produces the same output as the former indent=True
        # (True == 1) while stating the indent width explicitly.
        json.dump(play_dict, wfile, indent=1)

def write_itunes(path):
    """Convert Scrapy iTunes items to required JSON format.

    Reads the JSON list of items generated by the Scrapy crawl and writes the
    table (tableB.json, in the current directory) in the required JSON format.
    Items whose year cannot be converted to an integer are skipped. Missing
    fields are written as an empty string, except the rating and the number
    of ratings, which default to '0'.

    Args:
        path: Path to the JSON file containing the Scrapy iTunes items.
    """
    def first(item, key):
        """Return the first value stored under key, or '' if key is absent."""
        return item[key][0] if key in item else ''

    def joined(item, key):
        """Return the values under key joined by ', ' and stripped, or ''."""
        return ', '.join(item[key]).strip() if key in item else ''

    # NOTE(review): 'price' is declared here but no 'price' value is written
    # to the tuples below -- confirm whether that is intended.
    attributes = [('id', 'TEXT'), ('name', 'TEXT'), ('year', 'INTEGER'),
                  ('genre', 'TEXT'), ('description', 'TEXT'), ('price', 'TEXT'),
                  ('actors', 'TEXT'), ('writers', 'TEXT'),
                  ('producers', 'TEXT'), ('directors', 'TEXT'),
                  ('content_rating', 'TEXT'), ('rating', 'TEXT'),
                  ('number_of_ratings', 'INTEGER'),
                  ('similar_movies_id', 'TEXT'),
                  ('rotten_tomatoes_tomatometer', 'TEXT'),
                  ('rotten_tomatoes_average_rating', 'TEXT')]

    # OrderedDict keeps the keys of the output file in insertion order.
    table_dict = OrderedDict()
    table_dict['name'] = 'itunes'
    table_dict['description'] = 'Movies from iTunes Store'
    table_dict['idAttrib'] = {'name': 'id', 'type': 'TEXT'}
    table_dict['attributes'] = [{'name': n, 'type': t} for n, t in attributes]

    # Close the input file deterministically instead of leaking the handle.
    with open(path, 'r') as rfile:
        items = json.load(rfile)

    tuples = []
    for item in items:
        try:
            # For bad-data: drop the whole item when the year is not numeric.
            year = int(item['year'][0]) if 'year' in item else ''
        except ValueError:
            continue
        item_dict = OrderedDict()
        item_dict['id'] = first(item, 'id')
        item_dict['name'] = first(item, 'name')
        item_dict['year'] = year
        item_dict['genre'] = first(item, 'genre')
        item_dict['description'] = first(item, 'description').strip()
        item_dict['actors'] = joined(item, 'actors')
        item_dict['writers'] = joined(item, 'writers')
        item_dict['producers'] = joined(item, 'producers')
        item_dict['directors'] = joined(item, 'directors')
        item_dict['content_rating'] = first(item, 'content_rating')
        item_dict['rating'] = '{0:.1f}'.format(item['stars'][0]) \
                if 'stars' in item else '0'
        # NOTE(review): declared INTEGER in the attribute list but written as
        # a string; kept as-is so the output format does not change.
        item_dict['number_of_ratings'] = str(item['star_ratings'][0]) \
                if 'star_ratings' in item else '0'
        # Unlike the people lists above, the ids are joined without stripping.
        item_dict['similar_movies_id'] = \
                ', '.join(item['similar_movies_id']) \
                if 'similar_movies_id' in item else ''
        item_dict['rotten_tomatoes_tomatometer'] = \
                first(item, 'rotten_tomatoes_tomatometer')
        item_dict['rotten_tomatoes_average_rating'] = \
                first(item, 'rotten_tomatoes_average_rating')
        tuples.append(item_dict)

    itunes_dict = OrderedDict()
    itunes_dict['table'] = table_dict
    itunes_dict['tuples'] = tuples
    with open('tableB.json', 'w') as wfile:
        # indent=1 produces the same output as the former indent=True
        # (True == 1) while stating the indent width explicitly.
        json.dump(itunes_dict, wfile, indent=1)

def main():
    """Run the conversion for the two paths given on the command line.

    Expects exactly two arguments: the Google Play Scrapy JSON file followed
    by the iTunes Scrapy JSON file. With any other argument count the usage
    string is printed and the script exits with status 1.
    """
    arguments = sys.argv[1:]
    if len(arguments) != 2:
        usage()
        sys.exit(1)
    play_path, itunes_path = arguments
    write_google_play(play_path)
    write_itunes(itunes_path)

# Run the conversion only when executed as a script, not when imported.
if __name__ == '__main__':
    main()