1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236 | """
Google Play movies crawler written for Scrapy crawling framework.
This file implements a spider for crawling movies from Google Play. The crawler
works by fetching all the top-selling and new release movies for each genre
available in the Google Play movies site (https://play.google.com/store/movies).
Note that Google Play does not have an easy way to exhaustively list all the
movies available to stream.
Usage:
scrapy crawl google-play [options]
Examples:
# Crawl all movies in Play.
scrapy crawl google-play
# Crawl all movies in Play and write the crawled movie items to
# google-play.json in JSON format.
scrapy crawl google-play -o google-play.json -t json
# Test single movie items.
scrapy crawl google-play -a test=movie_item
# Test few movie listing pages.
scrapy crawl google-play -a test=listing
"""
import errno
import os
import urlparse
from scrapy.contrib.loader import XPathItemLoader
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
from scrapy.spider import BaseSpider
from movies.items import GooglePlayMovieItem
# Hostname of the Google Play store; also used as the allowed crawl domain.
DOMAIN = 'play.google.com'
# Base site URL.
SITE_DOMAIN = 'https://' + DOMAIN
# Common prefix for all movie-related URLs on the site.
SITE_PREFIX = SITE_DOMAIN + '/store/movies/'
# A couple of known movie detail pages, used by the 'test_movie_item' mode.
SAMPLE_MOVIE = (
SITE_PREFIX + 'details/Mr_Peabody_and_Sherman?id=dPzYb6bv_wM',
SITE_PREFIX + 'details/Unforgiven?id=_1MiCX608Vg',
)
# A small (3-entry) listing page, used by the 'test_listing' mode.
SAMPLE_LISTING = SITE_PREFIX + 'category/MOVIE/collection/topselling_paid?num=3'
def MkDir(path):
    """Create the directory 'path', tolerating an already-existing directory.

    Intermediate directories are created as needed (like mkdir -p).

    Args:
        path: Path of the directory to create.
    """
    try:
        os.makedirs(path)
    except OSError as exc:
        # Re-raise unless the failure is simply that the directory exists.
        if exc.errno != errno.EEXIST or not os.path.isdir(path):
            raise
def GetID(url):
    """Parse the Google Play movie id from the URL.

    The Google Play movie id is present as a query string 'id=<ID>'.

    Args:
        url: URL for the movie item.

    Returns:
        The movie id if an 'id' query parameter is present, otherwise the
        full URL itself as a fallback identifier.
    """
    parsed = urlparse.urlparse(url)
    # Only a missing 'id' parameter should trigger the fallback; the previous
    # bare 'except' also swallowed unrelated errors (even KeyboardInterrupt).
    try:
        return dict(urlparse.parse_qsl(parsed.query))['id']
    except KeyError:
        return url
class GooglePlaySpider(BaseSpider):
    """Google Play movies spider.

    Attributes:
        name: Name of the crawler.
        allowed_domains: Allowed domains.
        categories: List of genre-ids in Google Play.
        batch: Number of movie items to query at a time.
        entries: Maximum number of entries to fetch from each genre listing.
            (Google Play only returns a maximum of 600 movie items.)
        mode: Mode of operation of the spider. 'normal' crawls the entire site.
            'test_movie_item' tests crawling a few movie items. 'test_listing'
            tests crawling a few movie listing pages.
        seen: Set of movie-ids already crawled.
    """
    name = 'google-play'
    allowed_domains = [DOMAIN]
    categories = range(1, 11) + [13, 15, 18, 25, 26, 27]
    batch = 100
    entries = 600

    def __init__(self, *args, **kwargs):
        """Constructor.

        Initializes Google Play spider with normal mode or test mode.

        Args:
            args: Ignored.
            kwargs: If 'test' key is set, mode is changed to:
                'test_' + kwargs['test']
        """
        super(GooglePlaySpider, self).__init__(*args, **kwargs)
        self.seen = set()
        # dict.has_key() is deprecated; the 'in' operator is the portable form.
        if 'test' in kwargs:
            self.mode = 'test_' + kwargs['test']
        else:
            self.mode = 'normal'

    def start_requests(self):
        """Returns seed URLs to start the crawling.

        Iterates through each genre and yields URLs for all the pages in the
        top-selling movies and new releases.

        Yields:
            Request() objects for all movie listing pages, which are processed
            by self.parse() method.
        """
        if self.mode == 'test_movie_item':
            for movie in SAMPLE_MOVIE:
                yield Request(movie, callback=self.parse_item)
        elif self.mode == 'test_listing':
            yield Request(SAMPLE_LISTING, callback=self.parse)
        else:
            for cat in self.categories:
                # Page through each collection in 'batch'-sized windows up to
                # the 'entries' cap the site enforces.
                for start in xrange(0, self.entries, self.batch):
                    yield Request('{0}category/{1}/collection/topselling_paid?start={2}&num={3}'.format(
                        SITE_PREFIX, cat, start, self.batch), callback=self.parse)
                    yield Request('{0}category/{1}/collection/movers_shakers?start={2}&num={3}'.format(
                        SITE_PREFIX, cat, start, self.batch), callback=self.parse)

    def parse(self, response):
        """Parse a movie listing page.

        Args:
            response: Response object for the movie listing URL. response.body
                has the text of the response.

        Yields:
            Request() objects for all the movies listed in that page, which are
            processed by self.parse_item() method.
        """
        self.dump('listing', response.url, response.body)
        hxs = HtmlXPathSelector(response)
        for url in hxs.select('//div[@class="details"]/a/@href').extract():
            movie_id = GetID(url)
            # De-duplicate across listings: the same movie appears in several
            # genre/collection pages.
            if movie_id not in self.seen:
                self.seen.add(movie_id)
                yield Request(SITE_DOMAIN + url, callback=self.parse_item)

    def parse_item(self, response):
        """Parse a movie page.

        Args:
            response: Response object for the movie page. response.body has the
                text of the response.

        Returns:
            GooglePlayMovieItem() object with attributes parsed from the page.
        """
        self.dump('item', response.url, response.body)
        hxs = HtmlXPathSelector(response)
        d = hxs.select('//div[@class="details-info"]')[0]
        loader = XPathItemLoader(item=GooglePlayMovieItem(), selector=d)
        loader.add_xpath('name', '//div[@itemprop="name"]/div/text()')
        loader.add_value('id', GetID(response.url))
        # datePublished renders as '<month> <year>' — split it into two fields.
        _pub = d.select('//div[@itemprop="datePublished"]/text()').extract()[0].strip().split()
        loader.add_value('month', _pub[0])
        loader.add_value('year', _pub[1])
        loader.add_xpath('genre', '//span[@itemprop="genre"]/text()')
        loader.add_value('offers', self.parse_offers(d))
        loader.add_xpath('synopsis',
            '//div[contains(@class,"details-section description")]/meta[@itemprop="description"]/@content')
        # Histogram bar counts use thousands separators; strip them and store
        # the counts in ascending star order (page lists 5 stars first).
        _ratings = map(lambda x: int(x.replace(',', '')),
            d.select('//div[@class="rating-histogram"]//span[@class="bar-number"]/text()').extract())
        _ratings.reverse()
        loader.add_value('rating', _ratings)
        loader.add_xpath('actors', '//span[@itemprop="actor"]/a/span/text()')
        loader.add_xpath('producers', '//span[@itemprop="producer"]/a/span/text()')
        loader.add_xpath('writers', '//span[@itemprop="author"]/a/span/text()')
        loader.add_xpath('directors', '//span[@itemprop="director"]/a/span/text()')
        loader.add_xpath('content_rating', '//div[@itemprop="contentRating"]/text()')
        loader.add_xpath('run_time', '//meta[@itemprop="duration"]/@content')
        loader.add_xpath('similar_movies', '//div[@class="details-section recommendation"]//a[@class="title"]/@title')
        loader.add_value('similar_movies_id',
            map(GetID,
                d.select('//div[@class="details-section recommendation"]//a[@class="title"]/@href').extract()))
        return loader.load_item()

    def parse_offers(self, div):
        """Parse the offers for the movie.

        Args:
            div: <div> containing the movie offers information.

        Returns:
            List of 2-entry tuples each containing the description and price
            of the offer.
        """
        price = div.select('(//button[@class="price buy"])[1]//span[@itemprop="offers"]/meta[@itemprop="price"]/@content').extract()
        desc = div.select('(//button[@class="price buy"])[1]//span[@itemprop="offers"]/meta[@itemprop="description"]/@content').extract()
        return zip(desc, price)

    def dump(self, type, url, body):
        """Dump the HTML file to disk.

        Args:
            type: Type of the URL ('listing' or 'item'); selects the dump
                subdirectory. (Name shadows the builtin, kept for interface
                compatibility.)
            url: URL of the page.
            body: Contents of the URL.
        """
        directory = 'data/{0}/{1}'.format(self.name, type)
        MkDir(directory)
        filename = '{0}/{1}'.format(directory, self.dump_filename(url))
        with open(filename, 'w') as dump:
            dump.write(body)

    def dump_filename(self, url):
        """File to dump the HTML pages.

        Args:
            url: URL of the HTML page.

        Returns:
            Sane path from the URL, replacing '/' and '?' with '_'.
        """
        # Strip 'https://play.google.com/' (the +1 removes the leading '/').
        url = url[len(SITE_DOMAIN)+1:].replace('/', '_').replace('?', '_')
        return url + '.html'
|