ó
¾÷Xc           @`  sØ   d  Z  d d l m Z d d l m Z d d l Z d d l Z d d l Z d d l m	 Z	 d d l m
 Z
 d d l Z e j d k  r‘ e j Z n	 e j Z d e d	 d
 „ Z d e d	 d „ Z d e f d „  ƒ  YZ d S(   sQ   Utilities for text input preprocessing.

May benefit from a fast Cython rewrite.
i    (   t   absolute_import(   t   divisionN(   t   range(   t   zipi   s!   !"#$%&()*+,-./:;<=>?@[\]^_`{|}~	
t    c         C`  sc   | r |  j  ƒ  }  n  |  j t | | t | ƒ ƒ ƒ }  |  j | ƒ } g  | D] } | rM | ^ qM S(   s>  Converts a text to a sequence of word indices.

    # Arguments
        text: Input text (string).
        filters: Sequence of characters to filter out.
        lower: Whether to convert the input to lowercase.
        split: Sentence split marker (string).

    # Returns
        A list of integer word indices.
    (   t   lowert	   translatet	   maketranst   lent   split(   t   textt   filtersR   R	   t   seqt   i(    (    s7   /tmp/pip-build-isqEY4/keras/keras/preprocessing/text.pyt   text_to_word_sequence   s
    "c         C`  sM   t  |  d | d | d | ƒ} g  | D]$ } t t | ƒ ƒ | d d ^ q% S(   NR   R   R	   i   (   R   t   abst   hash(   R
   t   nR   R   R	   R   t   w(    (    s7   /tmp/pip-build-isqEY4/keras/keras/preprocessing/text.pyt   one_hot+   s
    		t	   Tokenizerc           B`  sb   e  Z d  Z d d e d e d „ Z d „  Z d „  Z d „  Z	 d „  Z
 d d	 „ Z d d
 „ Z RS(   sÇ  Text tokenization utility class.

    This class allows to vectorize a text corpus, by turning each
    text into either a sequence of integers (each integer being the index
    of a token in a dictionary) or into a vector where the coefficient
    for each token could be binary, based on word count, based on tf-idf...

    # Arguments
        num_words: the maximum number of words to keep, based
            on word frequency. Only the most common `num_words` words will
            be kept.
        filters: a string where each element is a character that will be
            filtered from the texts. The default is all punctuation, plus
            tabs and line breaks, minus the `'` character.
        lower: boolean. Whether to convert the texts to lowercase.
        split: character or string to use for token splitting.
        char_level: if True, every character will be treated as a word.

    By default, all punctuation is removed, turning the texts into
    space-separated sequences of words
    (words maybe include the `'` character). These sequences are then
    split into lists of tokens. They will then be indexed or vectorized.

    `0` is a reserved index that won't be assigned to any word.
    s!   !"#$%&()*+,-./:;<=>?@[\]^_`{|}~	
R   c         K`  s–   d | k r+ t  j d ƒ | j d ƒ } n  | rJ t d t | ƒ ƒ ‚ n  i  |  _ i  |  _ | |  _ | |  _ | |  _	 | |  _
 d |  _ | |  _ d  S(   Nt   nb_wordssD   The `nb_words` argument in `Tokenizer` has been renamed `num_words`.s    Unrecognized keyword arguments: i    (   t   warningst   warnt   popt	   TypeErrort   strt   word_countst	   word_docsR   R	   R   t	   num_wordst   document_countt
   char_level(   t   selfR   R   R   R	   R   t   kwargs(    (    s7   /tmp/pip-build-isqEY4/keras/keras/preprocessing/text.pyt   __init__Q   s    							c   	   	   C`  s«  d |  _  xÖ | D]Î } |  j  d 7_  |  j r4 | n t | |  j |  j |  j ƒ } x@ | D]8 } | |  j k r„ |  j | c d 7<qY d |  j | <qY WxF t | ƒ D]8 } | |  j k rÍ |  j | c d 7<q¢ d |  j | <q¢ Wq Wt	 |  j j
 ƒ  ƒ } | j d d „  d t ƒ g  | D] } | d ^ q} t t	 t | t	 t d t | ƒ d ƒ ƒ ƒ ƒ ƒ |  _ i  |  _ x7 t	 |  j j
 ƒ  ƒ D]  \ } } | |  j |  j | <qƒWd S(   s  Updates internal vocabulary based on a list of texts.

        Required before using `texts_to_sequences` or `texts_to_matrix`.

        # Arguments
            texts: can be a list of strings,
                or a generator of strings (for memory-efficiency)
        i    i   t   keyc         S`  s   |  d S(   Ni   (    (   t   x(    (    s7   /tmp/pip-build-isqEY4/keras/keras/preprocessing/text.pyt   <lambda>„   s    t   reverseN(   R   R   R   R   R   R	   R   t   setR   t   listt   itemst   sortt   Truet   dictR   R   R   t
   word_indext
   index_docs(	   R    t   textsR
   R   R   t   wcountst   wct
   sorted_voct   c(    (    s7   /tmp/pip-build-isqEY4/keras/keras/preprocessing/text.pyt   fit_on_textsh   s,    		7	"c         C`  s|   t  | ƒ |  _ i  |  _ x] | D]U } t | ƒ } x@ | D]8 } | |  j k r] d |  j | <q8 |  j | c d 7<q8 Wq Wd S(   s%  Updates internal vocabulary based on a list of sequences.

        Required before using `sequences_to_matrix`
        (if `fit_on_texts` was never called).

        # Arguments
            sequences: A list of sequence.
                A "sequence" is a list of integer word indices.
        i   N(   R   R   R.   R'   (   R    t	   sequencesR   R   (    (    s7   /tmp/pip-build-isqEY4/keras/keras/preprocessing/text.pyt   fit_on_sequences   s    
	c         C`  s1   g  } x$ |  j  | ƒ D] } | j | ƒ q W| S(   sL  Transforms each text in texts in a sequence of integers.

        Only top "num_words" most frequent words will be taken into account.
        Only words known by the tokenizer will be taken into account.

        # Arguments
            texts: A list of texts (strings).

        # Returns
            A list of sequences.
        (   t   texts_to_sequences_generatort   append(   R    R/   t   rest   vect(    (    s7   /tmp/pip-build-isqEY4/keras/keras/preprocessing/text.pyt   texts_to_sequences¡   s    c         c`  s­   |  j  } x | D]• } |  j r% | n t | |  j |  j |  j ƒ } g  } xT | D]L } |  j j | ƒ } | d k	 rP | rŒ | | k rŒ qP qœ | j	 | ƒ qP qP W| Vq Wd S(   sS  Transforms each text in texts in a sequence of integers.

        Only top "num_words" most frequent words will be taken into account.
        Only words known by the tokenizer will be taken into account.

        # Arguments
            texts: A list of texts (strings).

        # Yields
            Yields individual sequences.
        N(
   R   R   R   R   R   R	   R-   t   gett   NoneR8   (   R    R/   R   R
   R   R:   R   R   (    (    s7   /tmp/pip-build-isqEY4/keras/keras/preprocessing/text.pyR7   ²   s    	t   binaryc         C`  s"   |  j  | ƒ } |  j | d | ƒS(   sØ   Convert a list of texts to a Numpy matrix.

        # Arguments
            texts: list of strings.
            mode: one of "binary", "count", "tfidf", "freq".

        # Returns
            A Numpy matrix.
        t   mode(   R;   t   sequences_to_matrix(   R    R/   R?   R5   (    (    s7   /tmp/pip-build-isqEY4/keras/keras/preprocessing/text.pyt   texts_to_matrixÎ   s    
c      
   C`  só  |  j  s7 |  j r( t |  j ƒ d } q@ t d ƒ ‚ n	 |  j  } | d k re |  j re t d ƒ ‚ n  t j t | ƒ | f ƒ } xlt | ƒ D]^\ } } | s¥ q n  i  } xI | D]A } | | k rÊ q² n  | | k rã d | | <q² | | c d 7<q² Wxñ t | j	 ƒ  ƒ D]Ý \ } }	 | d k r3|	 | | | <q
| d k rZ|	 t | ƒ | | | <q
| d k rwd | | | <q
| d k rØd t j
 |	 ƒ }
 t j
 d |  j d |  j j | d	 ƒ ƒ } |
 | | | | <q
t d
 | ƒ ‚ q
Wq W| S(   s¸  Converts a list of sequences into a Numpy matrix.

        # Arguments
            sequences: list of sequences
                (a sequence is a list of integer word indices).
            mode: one of "binary", "count", "tfidf", "freq"

        # Returns
            A Numpy matrix.

        # Raises
            ValueError: In case of invalid `mode` argument,
                or if the Tokenizer requires to be fit to sample data.
        i   sI   Specify a dimension (num_words argument), or fit on some text data first.t   tfidfs7   Fit the Tokenizer on some data before using tfidf mode.g      ð?t   countt   freqR>   i    s   Unknown vectorization mode:(   R   R-   R   t
   ValueErrorR   t   npt   zerost	   enumerateR(   R)   t   logR.   R<   (   R    R5   R?   R   R$   R   R   t   countst   jR3   t   tft   idf(    (    s7   /tmp/pip-build-isqEY4/keras/keras/preprocessing/text.pyR@   Û   s@    			N(   t   __name__t
   __module__t   __doc__R=   R+   t   FalseR"   R4   R6   R;   R7   RA   R@   (    (    (    s7   /tmp/pip-build-isqEY4/keras/keras/preprocessing/text.pyR   6   s   	%			(   i   (   RP   t
   __future__R    R   t   stringt   syst   numpyRF   t	   six.movesR   R   R   t   version_infoR   R   R+   R   R   t   objectR   (    (    (    s7   /tmp/pip-build-isqEY4/keras/keras/preprocessing/text.pyt   <module>   s"   	