"""
Google Play movies crawler written for Scrapy crawling framework.

This file implements a spider for crawling movies from Google Play. The crawler
works by fetching all the top-selling and new release movies for each genre
available in the Google Play movies site (https://play.google.com/store/movies).
Note that Google Play does not have an easy way to exhaustively list all the
movies available to stream.

Usage:
    scrapy crawl google-play [options]

Examples:
    # Crawl all movies in Play.
    scrapy crawl google-play

    # Crawl all movies in Play and write the crawled movie items to
    # google-play.json in JSON format.
    scrapy crawl google-play -o google-play.json -t json

    # Test single movie items.
    scrapy crawl google-play -a test=movie_item

    # Test few movie listing pages.
    scrapy crawl google-play -a test=listing
"""
import errno
import os
import urlparse

from scrapy.contrib.loader import XPathItemLoader
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
from scrapy.spider import BaseSpider

from movies.items import GooglePlayMovieItem

# Host name of the Google Play store.
DOMAIN = 'play.google.com'
# Scheme plus host, without a trailing slash.
SITE_DOMAIN = 'https://{0}'.format(DOMAIN)
# Root of the movies section; always ends with '/'.
SITE_PREFIX = '{0}/store/movies/'.format(SITE_DOMAIN)
# Tuple of movie detail page URLs.
SAMPLE_MOVIE = (
    ''.join((SITE_PREFIX, 'details/Mr_Peabody_and_Sherman?id=dPzYb6bv_wM')),
    ''.join((SITE_PREFIX, 'details/Unforgiven?id=_1MiCX608Vg')),
)
# URL of a single listing page limited to three entries (num=3).
SAMPLE_LISTING = ''.join(
    (SITE_PREFIX, 'category/MOVIE/collection/topselling_paid?num=3'))

def MkDir(path):
    """Make sure the directory 'path' exists.

    Missing parent directories are created as well. An already existing
    directory is not an error; any other failure is propagated.

    Args:
        path: Path of the directory to create.

    Raises:
        OSError: If the directory cannot be created for any reason other than
            it already existing as a directory (for example, 'path' exists
            but is a regular file).
    """
    try:
        os.makedirs(path)
    except OSError as exc:
        # EEXIST is also reported when 'path' is an existing non-directory,
        # so the isdir() check is needed to tell the two cases apart.
        already_a_directory = (exc.errno == errno.EEXIST
                               and os.path.isdir(path))
        if not already_a_directory:
            raise

def GetID(url):
    """Parse the Google Play movie id from the URL.

    The Google Play movie id is present as a query string 'id=<ID>'.

    Args:
        url: URL for the movie item.

    Returns:
        The value of the 'id' query parameter. When the URL has no 'id'
        parameter (or only a blank one), 'url' itself is returned unchanged
        so that callers still get a usable unique key.
    """
    query = urlparse.urlparse(url).query
    # parse_qsl() drops blank values by default, and dict() keeps the last
    # occurrence when a key is repeated in the query string.
    params = dict(urlparse.parse_qsl(query))
    # A plain lookup with a default replaces the former bare 'except:', which
    # would also have hidden unrelated errors such as KeyboardInterrupt.
    return params.get('id', url)

class GooglePlaySpider(BaseSpider):
    """Google Play movies spider.

    Attributes:
        name: Name of the crawler.
        allowed_domains: Allowed domains.
        categories: List of genre-id in Google Play.
        batch: Number of movie items to query at a time.
        entries: Maximum number of entries to fetch from each genre listing.
            (Google Play only returns a maximum of 600 movie items.)
        mode: Mode of operation of the spider. 'normal' crawls the entire site.
            'test_movie_item' tests crawling a few movie items. 'test_listing'
            tests crawling a few movie listing pages.
        seen: Set of movie-ids for which a movie page request has already
            been issued; used to avoid requesting the same movie twice.
    """

    name = 'google-play'
    allowed_domains = [DOMAIN]
    categories = range(1, 11) + [13, 15, 18, 25, 26, 27]
    batch = 100
    entries = 600

    def __init__(self, *args, **kwargs):
        """Constructor.

        Initializes Google Play spider with normal mode or test mode.

        Args:
            args: Forwarded unchanged to the base spider constructor.
            kwargs: Forwarded unchanged to the base spider constructor. If the
                'test' key is set, mode is changed to:
                'test_' + kwargs['test']
        """
        super(GooglePlaySpider, self).__init__(*args, **kwargs)
        self.seen = set()
        if 'test' in kwargs:
            self.mode = 'test_' + kwargs['test']
        else:
            self.mode = 'normal'

    def start_requests(self):
        """Returns seed URLs to start the crawling.

        In 'test_movie_item' mode the sample movie pages are requested
        directly. In 'test_listing' mode only the sample listing page is
        requested. In any other mode, iterates through each genre and yields
        URLs for all the pages of the 'topselling_paid' and 'movers_shakers'
        collections.

        Yields:
            Request() objects. Movie listing pages are processed by the
            self.parse() method; movie pages (test_movie_item mode only) are
            processed by the self.parse_item() method.
        """
        if self.mode == 'test_movie_item':
            for movie in SAMPLE_MOVIE:
                yield Request(movie, callback=self.parse_item)
        elif self.mode == 'test_listing':
            yield Request(SAMPLE_LISTING, callback=self.parse)
        else:
            listing_url = '{0}category/{1}/collection/{2}?start={3}&num={4}'
            for cat in self.categories:
                # Page through each listing 'batch' entries at a time.
                for start in xrange(0, self.entries, self.batch):
                    for collection in ('topselling_paid', 'movers_shakers'):
                        yield Request(
                            listing_url.format(SITE_PREFIX, cat, collection,
                                               start, self.batch),
                            callback=self.parse)

    def parse(self, response):
        """Parse a movie listing page.

        The raw page is dumped to disk first. Every movie link whose id has
        not been seen before is then scheduled for crawling.

        Args:
            response: Response object for the movie listing URL. response.body
                has the text of the response.

        Yields:
            Request() objects for all the not-yet-seen movies listed in that
            page, which are processed by self.parse_item() method.
        """
        self.dump('listing', response.url, response.body)
        hxs = HtmlXPathSelector(response)
        for url in hxs.select('//div[@class="details"]/a/@href').extract():
            movie_id = GetID(url)
            if movie_id in self.seen:
                continue
            self.seen.add(movie_id)
            # urljoin() resolves relative hrefs against the page URL and
            # leaves absolute hrefs intact; plain string concatenation would
            # produce a broken URL for anything but a root-relative href.
            yield Request(urlparse.urljoin(response.url, url),
                          callback=self.parse_item)

    def parse_item(self, response):
        """Parse a movie page.

        The raw page is dumped to disk first. If the page has no
        'details-info' block, nothing is extracted. If the publication date is
        missing or does not have at least two whitespace-separated parts, the
        'month' and 'year' fields are left unset instead of aborting the item.

        Args:
            response: Response object for the movie page. response.body has the
                text of the response.

        Returns:
            GooglePlayMovieItem() object with attributes parsed from the page,
            or None when the page has no 'details-info' block.
        """
        self.dump('item', response.url, response.body)
        hxs = HtmlXPathSelector(response)
        details = hxs.select('//div[@class="details-info"]')
        if not details:
            self.log('No details-info block in {0}; skipping.'.format(
                response.url))
            return None
        d = details[0]
        # NOTE: every XPath below starts with '//', so it is evaluated against
        # the whole document rather than only inside the details-info node.
        loader = XPathItemLoader(item=GooglePlayMovieItem(), selector=d)
        loader.add_xpath('name', '//div[@itemprop="name"]/div/text()')
        loader.add_value('id', GetID(response.url))
        published = d.select(
            '//div[@itemprop="datePublished"]/text()').extract()
        pub_parts = published[0].strip().split() if published else []
        if len(pub_parts) >= 2:
            # The first two tokens are taken as month and year, in that order.
            loader.add_value('month', pub_parts[0])
            loader.add_value('year', pub_parts[1])
        else:
            self.log('No usable publication date in {0}.'.format(
                response.url))
        loader.add_xpath('genre', '//span[@itemprop="genre"]/text()')
        loader.add_value('offers', self.parse_offers(d))
        loader.add_xpath('synopsis',
                '//div[contains(@class,"details-section description")]/meta[@itemprop="description"]/@content')
        counts = d.select(
            '//div[@class="rating-histogram"]//span[@class="bar-number"]/text()').extract()
        # Counts use ',' as thousands separator; they are stored in the
        # reverse of the order in which they appear on the page.
        ratings = [int(count.replace(',', '')) for count in counts]
        ratings.reverse()
        loader.add_value('rating', ratings)
        loader.add_xpath('actors', '//span[@itemprop="actor"]/a/span/text()')
        loader.add_xpath('producers', '//span[@itemprop="producer"]/a/span/text()')
        loader.add_xpath('writers', '//span[@itemprop="author"]/a/span/text()')
        loader.add_xpath('directors', '//span[@itemprop="director"]/a/span/text()')
        loader.add_xpath('content_rating','//div[@itemprop="contentRating"]/text()')
        loader.add_xpath('run_time', '//meta[@itemprop="duration"]/@content')
        loader.add_xpath('similar_movies', '//div[@class="details-section recommendation"]//a[@class="title"]/@title')
        similar_links = d.select(
            '//div[@class="details-section recommendation"]//a[@class="title"]/@href').extract()
        loader.add_value('similar_movies_id',
                         [GetID(link) for link in similar_links])
        return loader.load_item()

    def parse_offers(self, div):
        """Parse the offers for the movie.

        Only the first 'price buy' button found by the XPath is inspected.

        Args:
            div: <div> containing the movie offers information.

        Returns:
            List of (description, price) tuples, pairing the extracted
            descriptions and prices position by position. The list is as long
            as the shorter of the two extractions.
        """
        offer_xpath = ('(//button[@class="price buy"])[1]'
                       '//span[@itemprop="offers"]'
                       '/meta[@itemprop="{0}"]/@content')
        price = div.select(offer_xpath.format('price')).extract()
        desc = div.select(offer_xpath.format('description')).extract()
        return list(zip(desc, price))

    def dump(self, type, url, body):
        """Dump the HTML file to disk.

        The file is written to 'data/<spider name>/<type>/', which is created
        if needed. An existing file of the same name is overwritten.

        Args:
            type: Type of the URL.
            url: URL of the page.
            body: Contents of the URL.
        """
        directory = 'data/{0}/{1}'.format(self.name, type)
        MkDir(directory)
        filename = '{0}/{1}'.format(directory, self.dump_filename(url))
        # Binary mode: callers pass the raw response body, which must be
        # written without any newline translation.
        with open(filename, 'wb') as out:
            out.write(body)

    def dump_filename(self, url):
        """File to dump the HTML pages.

        The leading len(SITE_DOMAIN) + 1 characters of 'url' are dropped, i.e.
        the URL is assumed to start with SITE_DOMAIN followed by '/'.

        Args:
            url: URL of the HTML page.

        Returns:
            Sane path from the URL, replacing '/' and '?' with '_' and
            appending '.html'.
        """
        url = url[len(SITE_DOMAIN)+1:].replace('/', '_').replace('?', '_')
        return url + '.html'