
"""Multi-threaded word2vec unbatched skip-gram model.

Trains the model described in:
(Mikolov, et al.) Efficient Estimation of Word Representations in Vector Space
ICLR 2013.
http://arxiv.org/abs/1301.3781
This model does true SGD (i.e. no minibatching). To do this efficiently, custom
ops are used to sequentially process data within a 'batch'.

The key ops used are:
* skipgram custom op that does input processing.
* neg_train custom op that efficiently calculates and applies the gradient using
  true SGD.
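
Typical invocation, with illustrative paths:
  python word2vec_optimized.py --train_data=text8 \
    --eval_data=questions-words.txt --save_path=/tmp/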
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import sys
import threading
import time

import tensorflow.python.platform

from six.moves import xrange  # pylint: disable=redefined-builtin

import numpy as np
import tensorflow as tf

from tensorflow.models.embedding import gen_word2vec as word2vec

flags = tf.app.flags

flags.DEFINE_string("save_path", None, "Directory to write the model.")
flags.DEFINE_string(
    "train_data", None,
    "Training data. E.g., unzipped file http://mattmahoney.net/dc/text8.zip.")
flags.DEFINE_string(
    "eval_data", None, "Analogy questions. "
    "https://word2vec.googlecode.com/svn/trunk/questions-words.txt.")
flags.DEFINE_integer("embedding_size", 200, "The embedding dimension size.")
flags.DEFINE_integer(
    "epochs_to_train", 15,
    "Number of epochs to train. Each epoch processes the training data once "
    "completely.")
flags.DEFINE_float("learning_rate", 0.025, "Initial learning rate.")
flags.DEFINE_integer("num_neg_samples", 25,
                     "Negative samples per training example.")
flags.DEFINE_integer("batch_size", 500,
                     "Numbers of training examples each step processes "
                     "(no minibatching).")
flags.DEFINE_integer("concurrent_steps", 12,
                     "The number of concurrent training steps.")
flags.DEFINE_integer("window_size", 5,
                     "The number of words to predict to the left and right "
                     "of the target word.")
flags.DEFINE_integer("min_count", 5,
                     "The minimum number of word occurrences for it to be "
                     "included in the vocabulary.")
flags.DEFINE_float("subsample", 1e-3,
                   "Subsample threshold for word occurrence. Words that "
                   "appear with higher frequency will be randomly "
                   "down-sampled. Set to 0 to disable.")
flags.DEFINE_boolean(
    "interactive", False,
    "If true, enters an IPython interactive session to play with the trained "
    "model. E.g., try model.analogy('france', 'paris', 'russia') and "
    "model.nearby(['proton', 'elephant', 'maxwell']).")

FLAGS = flags.FLAGS

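# Options snapshots the flag values at construction time, so a model's
# hyper-parameters stay fixed even if FLAGS changes later (for example,
# from the interactive shell).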
class Options(object):
  """Options used by our word2vec model."""

  def __init__(self):
    # Model options.

    # Embedding dimension.
    self.emb_dim = FLAGS.embedding_size

    # The training text file.
    self.train_data = FLAGS.train_data

    # Number of negative samples per example.
    self.num_samples = FLAGS.num_neg_samples

    # The initial learning rate.
    self.learning_rate = FLAGS.learning_rate

    # Number of epochs to train. After these many epochs, the learning
    # rate decays linearly to zero and the training stops.
    self.epochs_to_train = FLAGS.epochs_to_train

    # Concurrent training steps.
    self.concurrent_steps = FLAGS.concurrent_steps

    # Number of examples for one training step.
    self.batch_size = FLAGS.batch_size

    # The number of words to predict to the left and right of the target word.
    self.window_size = FLAGS.window_size

    # The minimum number of word occurrences for it to be included in the
    # vocabulary.
    self.min_count = FLAGS.min_count

    # Subsampling threshold for word occurrence.
    self.subsample = FLAGS.subsample

    # Where to write out summaries.
    self.save_path = FLAGS.save_path

    # The text file for eval.
    self.eval_data = FLAGS.eval_data


class Word2Vec(object):
  """Word2Vec model (Skipgram)."""

  def __init__(self, options, session):
    self._options = options
    self._session = session
    self._word2id = {}
    self._id2word = []
    self.build_graph()
    self.build_eval_graph()
    self.save_vocab()
    self._read_analogies()

  def _read_analogies(self):
    """Reads through the analogy question file.

    Returns:
      questions: a [n, 4] numpy array containing the analogy question's
                 word ids.
      questions_skipped: questions skipped due to unknown words.
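
    The eval file holds one question per line as four space-separated
    words, e.g. "athens greece baghdad iraq" (illustrative); lines
    starting with ":" are section headers and are skipped.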
    """
    questions = []
    questions_skipped = 0
    with open(self._options.eval_data, "rb") as analogy_f:
      for line in analogy_f:
        if line.startswith(":"):  # Skip comments.
          continue
        words = line.strip().lower().split(" ")
        ids = [self._word2id.get(w.strip()) for w in words]
        if None in ids or len(ids) != 4:
          questions_skipped += 1
        else:
          questions.append(np.array(ids))
    print("Eval analogy file: ", self._options.eval_data)
    print("Questions: ", len(questions))
    print("Skipped: ", questions_skipped)
    self._analogy_questions = np.array(questions, dtype=np.int32)

  def build_graph(self):
    """Build the model graph."""
    opts = self._options

    # The training data. A text file.
    (words, counts, words_per_epoch, current_epoch, total_words_processed,
     examples, labels) = word2vec.skipgram(filename=opts.train_data,
                                           batch_size=opts.batch_size,
                                           window_size=opts.window_size,
                                           min_count=opts.min_count,
                                           subsample=opts.subsample)
    (opts.vocab_words, opts.vocab_counts,
     opts.words_per_epoch) = self._session.run([words, counts,
                                                words_per_epoch])
    opts.vocab_size = len(opts.vocab_words)
    print("Data file: ", opts.train_data)
    print("Vocab size: ", opts.vocab_size - 1, " + UNK")
    print("Words per epoch: ", opts.words_per_epoch)

    self._id2word = opts.vocab_words
    for i, w in enumerate(self._id2word):
      self._word2id[w] = i

    # Declare all variables we need.
    # Input words embedding: [vocab_size, emb_dim]
    w_in = tf.Variable(
        tf.random_uniform([opts.vocab_size, opts.emb_dim],
                          -0.5 / opts.emb_dim, 0.5 / opts.emb_dim),
        name="w_in")

    # Output words embedding: [vocab_size, emb_dim]
    w_out = tf.Variable(tf.zeros([opts.vocab_size, opts.emb_dim]),
                        name="w_out")

    # Global step: scalar, i.e., shape [].
    global_step = tf.Variable(0, name="global_step")

    # Linear learning rate decay: the rate falls from learning_rate toward
    # zero as total_words_processed approaches words_to_train, with a small
    # floor so training never stops entirely.
    words_to_train = float(opts.words_per_epoch * opts.epochs_to_train)
    lr = opts.learning_rate * tf.maximum(
        0.0001,
        1.0 - tf.cast(total_words_processed, tf.float32) / words_to_train)

    # Training nodes.
    inc = global_step.assign_add(1)
    with tf.control_dependencies([inc]):
      train = word2vec.neg_train(w_in,
                                 w_out,
                                 examples,
                                 labels,
                                 lr,
                                 vocab_count=opts.vocab_counts.tolist(),
                                 num_negative_samples=opts.num_samples)

    self._w_in = w_in
    self._examples = examples
    self._labels = labels
    self._lr = lr
    self._train = train
    self.step = global_step
    self._epoch = current_epoch
    self._words = total_words_processed

  def save_vocab(self):
    """Save the vocabulary to a file so the model can be reloaded."""
    opts = self._options
    with open(os.path.join(opts.save_path, "vocab.txt"), "w") as f:
      for i in xrange(opts.vocab_size):
        f.write("%s %d\n" % (tf.compat.as_text(opts.vocab_words[i]),
                             opts.vocab_counts[i]))
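
  # The analogy evaluation below treats a question a:b :: c:? as vector
  # arithmetic on the l2-normalized embeddings: the prediction is the vocab
  # word closest (by cosine similarity) to c + (b - a). Equivalent numpy
  # sketch (illustrative only; `nemb` stands for the normalized
  # [vocab_size, emb_dim] embedding matrix):
  #   target = nemb[c] + (nemb[b] - nemb[a])
  #   pred_id = np.dot(nemb, target).argmax()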

  def build_eval_graph(self):
    """Build the evaluation graph."""
    opts = self._options

    # The eval feeds three vectors of word ids for a, b, c, each of
    # which is of size N, where N is the number of analogies we want to
    # evaluate in one batch.
    analogy_a = tf.placeholder(dtype=tf.int32)  # [N]
    analogy_b = tf.placeholder(dtype=tf.int32)  # [N]
    analogy_c = tf.placeholder(dtype=tf.int32)  # [N]

    # Normalized word embeddings of shape [vocab_size, emb_dim].
    nemb = tf.nn.l2_normalize(self._w_in, 1)

    # Each row of a_emb, b_emb, c_emb is a word's embedding vector.
    # They all have the shape [N, emb_dim].
    a_emb = tf.gather(nemb, analogy_a)
    b_emb = tf.gather(nemb, analogy_b)
    c_emb = tf.gather(nemb, analogy_c)

    # We expect that d's embedding vector on the unit hyper-sphere is
    # near: c_emb + (b_emb - a_emb), which has shape [N, emb_dim].
    target = c_emb + (b_emb - a_emb)

    # Compute cosine distance between each pair of target and vocab word.
    # dist has shape [N, vocab_size].
    dist = tf.matmul(target, nemb, transpose_b=True)

    # For each question (row in dist), find the top 4 words.
    _, pred_idx = tf.nn.top_k(dist, 4)

    # Nodes for computing neighbors for a given word according to
    # their cosine distance.
    nearby_word = tf.placeholder(dtype=tf.int32)  # word id
    nearby_emb = tf.gather(nemb, nearby_word)
    nearby_dist = tf.matmul(nearby_emb, nemb, transpose_b=True)
    nearby_val, nearby_idx = tf.nn.top_k(nearby_dist,
                                         min(1000, opts.vocab_size))

    # Nodes in the graph which are used by training and evaluation to
    # run/feed/fetch.
    self._analogy_a = analogy_a
    self._analogy_b = analogy_b
    self._analogy_c = analogy_c
    self._analogy_pred_idx = pred_idx
    self._nearby_word = nearby_word
    self._nearby_val = nearby_val
    self._nearby_idx = nearby_idx

    # Properly initialize all variables.
    tf.initialize_all_variables().run()

    self.saver = tf.train.Saver()

  def _train_thread_body(self):
    initial_epoch, = self._session.run([self._epoch])
    while True:
      _, epoch = self._session.run([self._train, self._epoch])
      if epoch != initial_epoch:
        break
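
  # The worker threads below all run the same `_train` op until the input
  # pipeline reports a new epoch. neg_train applies its per-example updates
  # without locking, in the lock-free style popularized as "Hogwild"; that
  # label is ours, but cheap lock-free updates are the point of the true-SGD
  # design described in the module docstring.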

  def train(self):
    """Train the model."""
    opts = self._options

    initial_epoch, initial_words = self._session.run([self._epoch,
                                                      self._words])

    workers = []
    for _ in xrange(opts.concurrent_steps):
      t = threading.Thread(target=self._train_thread_body)
      t.start()
      workers.append(t)

    last_words, last_time = initial_words, time.time()
    while True:
      time.sleep(5)  # Reports our progress once a while.
      (epoch, step, words, lr) = self._session.run(
          [self._epoch, self.step, self._words, self._lr])
      now = time.time()
      rate = (words - last_words) / (now - last_time)
      last_words, last_time = words, now
      print("Epoch %4d Step %8d: lr = %5.3f words/sec = %8.0f\r" %
            (epoch, step, lr, rate), end="")
      sys.stdout.flush()
      if epoch != initial_epoch:
        break

    for t in workers:
      t.join()

  def _predict(self, analogy):
    """Predict the top 4 answers for analogy questions."""
    idx, = self._session.run([self._analogy_pred_idx], {
        self._analogy_a: analogy[:, 0],
        self._analogy_b: analogy[:, 1],
        self._analogy_c: analogy[:, 2]
    })
    return idx

  def eval(self):
    """Evaluate analogy questions and reports accuracy."""

    # How many questions we get right at precision@1.
    correct = 0

    total = self._analogy_questions.shape[0]
    start = 0
    while start < total:
      limit = start + 2500
      sub = self._analogy_questions[start:limit, :]
      idx = self._predict(sub)
      start = limit
      for question in xrange(sub.shape[0]):
        for j in xrange(4):
          if idx[question, j] == sub[question, 3]:
            # Bingo! We predicted correctly. E.g., [italy, rome, france,
            # paris].
            correct += 1
            break
          elif idx[question, j] in sub[question, :3]:
            # We need to skip words already in the question.
            continue
          else:
            # The correct label is not the precision@1.
            break
    print()
    print("Eval %4d/%d accuracy = %4.1f%%" % (correct, total,
                                              correct * 100.0 / total))

  def analogy(self, w0, w1, w2):
    """Predict word w3 as in w0:w1 vs w2:w3."""
    wid = np.array([[self._word2id.get(w, 0) for w in [w0, w1, w2]]])
    idx = self._predict(wid)
    for c in [self._id2word[i] for i in idx[0, :]]:
      if c not in [w0, w1, w2]:
        return c
    return "unknown"

  def nearby(self, words, num=20):
    """Prints out nearby words given a list of words."""
    ids = np.array([self._word2id.get(x, 0) for x in words])
    vals, idx = self._session.run(
        [self._nearby_val, self._nearby_idx], {self._nearby_word: ids})
    for i in xrange(len(words)):
      print("\n%s\n=====================================" % (words[i]))
      for (neighbor, distance) in zip(idx[i, :num], vals[i, :num]):
        print("%-20s %6.4f" % (self._id2word[neighbor], distance))


def _start_shell(local_ns=None):
  # An interactive shell is useful for debugging/development.
  import IPython
  user_ns = {}
  if local_ns:
    user_ns.update(local_ns)
  user_ns.update(globals())
  IPython.start_ipython(argv=[], user_ns=user_ns)
    t j    } t | |  } x+ t | j  D] }  | j   | j   q W| j j | t j j | j d  d | j t  j r t t    n  Wd QXWd QXd S(   s   Train a word2vec model.s;   --train_data --eval_data and --save_path must be specified.i   s
   model.ckptRC   N(   R   R   R   R   R5   R   t   exitR   RN   t   Grapht
   as_defaultt   SessionR   R   R	   Rl   R   R   t   saveRm   Rn   Ro   R_   R   R   t   locals(   R   Rb   R%   t   model(    (    sp   /tmp/pip-build-UG86a1/tensorflow/tensorflow-0.6.0.data/purelib/tensorflow/models/embedding/word2vec_optimized.pyt   main  s    
	 
$
	t   __main__(#   R   t
   __future__R    R   R   Rm   R   R   R   t   tensorflow.python.platformt
   tensorflowt	   six.movesR   t   numpyR3   RN   t   tensorflow.models.embeddingR   RF   t   appt   flagst   DEFINE_stringR0   t   DEFINE_integert   DEFINE_floatt   DEFINE_booleant   FalseR   t   objectR   R   R   R   R   RH   (    (    (    sp   /tmp/pip-build-UG86a1/tensorflow/tensorflow-0.6.0.data/purelib/tensorflow/models/embedding/word2vec_optimized.pyt   <module>   s^   	1 
	