"""
Crawls movie data from iTunes website.

Crawls the webpages in iTunes Store (http://itunes.apple.com) to retrieve the
movie data.  All webpages under the genres of our interest are crawled.

Usage:
    scrapy crawl itunes [options]

Examples:
    # Crawl all movies in iTunes.
    scrapy crawl itunes

    # Crawl all movies in iTunes and write the crawled movie items to
    # itunes.json in JSON format.
    scrapy crawl itunes -o itunes.json -t json

    # Test single movie items.
    scrapy crawl itunes -a test=movie_item

    # Test few movie listing pages.
    scrapy crawl itunes -a test=listing
"""
import errno
import os
import re

from scrapy.contrib.loader import XPathItemLoader
from scrapy.contrib.loader.processor import MapCompose
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
from scrapy.spider import BaseSpider

from movies.items import ItunesMovieItem

# Host name of the iTunes Store and the URL prefixes derived from it.
DOMAIN = 'itunes.apple.com'
SITE_DOMAIN = 'https://%s' % DOMAIN
SITE_PREFIX = '%s/us/movie/' % SITE_DOMAIN
SITE_GENRE = '%s/us/genre/movies-' % SITE_DOMAIN

# Individual movie pages requested in 'test_movie_item' mode.
SAMPLE_MOVIE_ITEMS = (
    SITE_PREFIX + 'the-fault-in-our-stars/id899849704',
    SITE_PREFIX + 'the-descendants/id485148916',
    SITE_PREFIX + 'drive/id492170756',
)

# Listing pages requested in 'test_listing' mode.
SAMPLE_MOVIE_LISTING = (
    SITE_GENRE + 'action-adventure/id4401?letter=I&page=2#page',
)

# Landing page URL of every movie genre crawled in 'normal' mode, built from
# the '<genre-slug>/id<genre-id>' tail of each genre URL.
GENRES = tuple(SITE_GENRE + slug for slug in (
    'action-adventure/id4401',
    'anime/id4402',
    'bollywood/id4431',
    'classics/id4403',
    'comedy/id4404',
    'documentary/id4405',
    'drama/id4406',
    'foreign/id4407',
    'holiday/id4420',
    'horror/id4408',
    'independent/id4409',
    'kids-family/id4410',
    'music-documentaries/id4423',
    'music-feature-films/id4424',
    'musicals/id4411',
    'regional-indian/id4432',
    'romance/id4412',
    'sci-fi-fantasy/id4413',
    'special-interest/id4415',
    'sports/id4417',
    'thriller/id4416',
    'urban/id4419',
    'western/id4418',
))

def MkDir(path):
    """Ensures that a directory exists at the given path.

    Missing parent directories are created as well.  A directory that is
    already present is not treated as an error.

    Args:
        path: Path of the directory to create.

    Raises:
        OSError: If the directory cannot be created for any reason other than
            it already existing as a directory (for example when the path
            names an existing regular file).
    """
    try:
        os.makedirs(path)
    except OSError as error:
        # EEXIST is also reported when a non-directory occupies the path, so
        # the isdir() check is needed to tell the harmless case apart.
        already_present = error.errno == errno.EEXIST and os.path.isdir(path)
        if not already_present:
            raise

def GetID(url):
    """Fetches the id of the movie from the url.

    The id is the run of digits following the last '/id' path marker in the
    url.  Requiring at least one digit keeps an unrelated 'id' later in the
    url (for example inside a '?video=1' query) from producing an empty id.

    Args:
        url: The url of the webpage for the movie.

    Returns:
        The id of the movie as a string of digits. For example:
        If url is https://itunes.apple.com/us/movie/the-fault-in-our-stars/id899849704
            id = '899849704'

    Raises:
        ValueError: If the url contains no '/id<digits>' segment.
    """
    ids = re.findall(r'/id(\d+)', url)
    if not ids:
        raise ValueError('No movie id found in url: %r' % (url,))
    # Take the last occurrence, matching the old greedy '.*id' behaviour.
    return ids[-1]

class ItunesSpider(BaseSpider):
    """The Spider class which crawls iTunes movie data.

    Attributes:
        name: Name of the spider.
        allowed_domains: The domains this spider is allowed to crawl.
        mode: Mode of operation of the spider. 'normal' crawls the entire site.
            'test_movie_item' tests crawling a few movie items. 'test_listing'
            tests crawling a few movie listing pages. 'test_genre' crawls
            every letter of a single genre.
        seen: Set of movie-ids for which a request has already been issued.
    """

    name = "itunes"
    allowed_domains = [DOMAIN]

    def __init__(self, *args, **kwargs):
        """Constructor.

        Initializes iTunes spider with normal mode or test mode.

        Args:
            args: Forwarded unchanged to the base spider constructor.
            kwargs: Forwarded unchanged to the base spider constructor. If
                the 'test' key is set, mode is changed to:
                'test_' + kwargs['test']
        """
        super(ItunesSpider, self).__init__(*args, **kwargs)
        self.seen = set()
        if 'test' in kwargs:
            self.mode = 'test_' + kwargs['test']
        else:
            self.mode = 'normal'

    def _letter_urls(self, genre_url):
        """Returns the per-letter URLs ('A'..'Z' and '*') of a genre page.

        Args:
            genre_url: URL of a genre landing page without a query string.

        Returns:
            List of genre_url + '?letter=<X>' strings, in alphabetical order
            followed by the '*' bucket.
        """
        letters = [chr(code) for code in range(ord('A'), ord('Z') + 1)] + ['*']
        return [genre_url + '?letter=' + letter for letter in letters]

    def start_requests(self):
        """Returns initial seed URLs to start crawling.

        In 'normal' mode, requests every letter page of every genre. The
        test modes request a small fixed set of pages instead. An unknown
        mode produces no requests and only logs a message.

        Yields:
            Request() objects whose callback depends on the mode:
            self.parse_item() for 'test_movie_item', self.parse_listing() for
            'test_listing', and self.parse_pages() for 'test_genre' and
            'normal'.
        """
        if self.mode == 'test_movie_item':
            for url in SAMPLE_MOVIE_ITEMS:
                yield Request(url, callback=self.parse_item)
        elif self.mode == 'test_listing':
            for url in SAMPLE_MOVIE_LISTING:
                yield Request(url, callback=self.parse_listing)
        elif self.mode == 'test_genre':
            genre = SITE_GENRE + 'action-adventure/id4401'
            for letter_url in self._letter_urls(genre):
                yield Request(letter_url, callback=self.parse_pages)
        elif self.mode == 'normal':
            for genre in GENRES:
                for letter_url in self._letter_urls(genre):
                    yield Request(letter_url, callback=self.parse_pages)
        else:
            # A mistyped '-a test=...' value would otherwise finish silently
            # with an empty crawl.
            self.log('Unknown mode %r; nothing to crawl.' % (self.mode,))

    def parse_pages(self, response):
        """Parse the genre page.

        Parse the genre page and return Request()'s for all pages under a
        particular alphabet.  When the page has no pagination links, the
        response itself is treated as the only listing page.

        Args:
            response: Response object for the genre page. response.body has the
                text of the response.

        Yields:
            Request() objects for all the pages in that particular movie genre,
            which are processed by self.parse_listing() method; or, without
            pagination, whatever self.parse_listing() yields for this page.
        """
        self.dump('genre', response.url, response.body)
        hxs = HtmlXPathSelector(response)
        page_urls = hxs.select(
                '//div[@id="selectedgenre"]/ul[@class="list paginate"][1]/li/a/@href'
                ).extract()
        if page_urls:
            for url in page_urls:
                yield Request(url, callback=self.parse_listing)
        else:
            # All entries in one page.
            for item in self.parse_listing(response):
                yield item

    def parse_listing(self, response):
        """Parse all movies within a particular page.

        Movies whose id was already requested (tracked in self.seen) are
        skipped.

        Args:
            response: Response object for the movie listing page. response.body
                has the text of the response.

        Yields:
            Request() objects for all the movies in that movie listing page,
            which are processed by self.parse_item() method.
        """
        self.dump('listing', response.url, response.body)
        hxs = HtmlXPathSelector(response)
        movie_urls = hxs.select('//div[@id="selectedcontent"]//ul/li/a/@href')
        for url in movie_urls.extract():
            movie_id = GetID(url)
            if movie_id not in self.seen:
                self.seen.add(movie_id)
                yield Request(url, callback=self.parse_item)

    def parse_item(self, response):
        """Parse all the necessary information from a movie page.

        Args:
            response: Response object for the movie page. response.body has the
                text of the response.

        Returns:
            ItunesMovieItem() object with all the movie attributes parsed.
        """
        self.dump('item', response.url, response.body)
        hxs = HtmlXPathSelector(response)
        loader = XPathItemLoader(item=ItunesMovieItem(), selector=hxs)
        canonical = hxs.select('//head/link[@rel="canonical"]/@href').extract()
        # Fall back to the fetched URL when the page carries no canonical
        # link, instead of failing with IndexError on the empty list.
        loader.add_value('id',
                GetID(canonical[0] if canonical else response.url))
        loader.add_xpath('name', '//div[@id="title"]/div/h1[1]/text()')
        loader.add_xpath('description', '//div[@class="plot-summary"]/p/text()')
        loader.add_xpath('actors', '//div[@metrics-loc="Titledbox_Actors"]/ul/li/a/text()')
        loader.add_xpath('directors', '//div[@metrics-loc="Titledbox_Director"]/ul/li/a/text()')
        loader.add_xpath('writers', '//div[@metrics-loc="Titledbox_Screenwriter"]/ul/li/a/text()')
        loader.add_xpath('producers', '//div[@metrics-loc="Titledbox_Producers"]/ul/li/a/text()')
        loader.add_xpath('price', '//span[@class="price"]/text()')
        loader.add_xpath('content_rating', '//span[@class="content-rating"]/text()')
        loader.add_xpath('genre', '//li[@class="genre"]/a/text()')
        loader.add_xpath('year', '//li[@class="release-date"]/text()')
        loader.add_xpath('copyright', '//li[@class="copyright"]/text()')
        # The class attribute is compared for exact equality, so a
        # "rating-star half" span is counted only by the second expression.
        loader.add_value('stars',
                len(hxs.select('//div[@id="left-stack"]//span[@class="rating-star"]'))*1.0 +
                len(hxs.select('//div[@id="left-stack"]//span[@class="rating-star half"]'))*0.5)
        loader.add_xpath('star_ratings',
                '//div[@id="left-stack"]//span[@class="rating-count"]/text()',
                re='(.*) Ratings')
        loader.add_xpath('rotten_tomatoes_tomatometer',
                '//div[@class="tomatometer"]/span[@class="percent"]/text()',
                MapCompose(unicode.strip))
        loader.add_xpath('rotten_tomatoes_reviews',
                '//div[@class="movie-review-left"]//li[@class="total-reviews"]/text()',
                MapCompose(unicode.strip))
        loader.add_xpath('rotten_tomatoes_fresh',
                '//div[@class="movie-review-left"]//li[@class="fresh-reviews"]/text()',
                MapCompose(unicode.strip))
        loader.add_xpath('rotten_tomatoes_rotten',
                '//div[@class="movie-review-left"]//li[@class="rotten-reviews"]/text()',
                MapCompose(unicode.strip))
        loader.add_xpath('rotten_tomatoes_average_rating',
                '//div[@class="movie-review-left"]//li[@class="average-reviews"]/text()',
                MapCompose(unicode.strip), re='(.*)/10')
        loader.add_xpath('similar_movies', '//div[@metrics-loc="Titledbox_Viewers Also Bought"]//a[@class="name"]/text()')
        loader.add_xpath('similar_movies_id',
                '//div[@metrics-loc="Titledbox_Viewers Also Bought"]//a[@class="name"]/@href',
                MapCompose(GetID))
        return loader.load_item()

    def dump(self, type, url, body):
        """Dump the HTML content in the corresponding directories.

        The file is written to data/<spider name>/<type>/<dump_filename(url)>;
        the directory is created when missing and an existing file is
        overwritten.

        Args:
            type: Type of the URL ('genre', 'listing' or 'item' for the calls
                made in this class); used as the sub-directory name.
            url: URL of the page.
            body: Contents of the URL.
        """
        directory = 'data/{0}/{1}'.format(self.name, type)
        MkDir(directory)
        filename = '{0}/{1}'.format(directory, self.dump_filename(url))
        # The body is the raw response payload, so write it in binary mode to
        # avoid any newline translation.
        with open(filename, 'wb') as dump:
            dump.write(body)

    def dump_filename(self, url):
        """Formats the filename before dumping.

        Args:
            url: URL of the HTML page.

        Returns:
            Sane path from the URL: the scheme and host are dropped and '/',
            '?', '&', and '=' are replaced with '_', followed by '.html'.
        """
        prefix = SITE_DOMAIN + '/'
        if url.startswith(prefix):
            path = url[len(prefix):]
        else:
            # The URL does not start with SITE_DOMAIN (for example a
            # different scheme); drop its own scheme and host rather than
            # slicing a fixed number of characters.
            path = url.split('://', 1)[-1].partition('/')[2]
        for char in '/?&=':
            path = path.replace(char, '_')
        return path + '.html'