ó
u›XQc        
   @   sÏ  d  Z  d g Z d d l m Z m Z d d l Z d d l Z e j d  \ Z Z Z	 e d k p˜ e d k rw e d k p˜ e d k o˜ e d k o˜ e	 d k Z
 d d l m Z m Z m Z m Z m Z d d l m Z m Z d d	 l m Z m Z m Z d
 Z d e f d „  ƒ  YZ d e f d „  ƒ  YZ e d k rËe d k rËe
 rËd d l Z e j d ƒ Z e e _ e j d e j ƒ Z e e _ d d l  m! Z! m" Z" d „  Z# d „  Z$ e# e _# e$ e _$ e% Z
 n  d S(   sC   Use the HTMLParser library to parse HTML files that aren't too bad.t   HTMLParserTreeBuilderiÿÿÿÿ(   t
   HTMLParsert   HTMLParseErrorNi   i   (   t   CDatat   Commentt   Declarationt   Doctypet   ProcessingInstruction(   t   EntitySubstitutiont   UnicodeDammit(   t   HTMLt   HTMLTreeBuildert   STRICTs   html.parsert   BeautifulSoupHTMLParserc           B   sY   e  Z d  „  Z d „  Z d „  Z d „  Z d „  Z d „  Z d „  Z d „  Z	 d „  Z
 RS(	   c         C   s#   |  j  j | d  d  t | ƒ ƒ d  S(   N(   t   soupt   handle_starttagt   Nonet   dict(   t   selft   namet   attrs(    (    s<   /scratch/rashmi/Condor_Script/src/bs4/builder/_htmlparser.pyR   .   s    c         C   s   |  j  j | ƒ d  S(   N(   R   t   handle_endtag(   R   R   (    (    s<   /scratch/rashmi/Condor_Script/src/bs4/builder/_htmlparser.pyR   2   s    c         C   s   |  j  j | ƒ d  S(   N(   R   t   handle_data(   R   t   data(    (    s<   /scratch/rashmi/Condor_Script/src/bs4/builder/_htmlparser.pyR   5   s    c         C   sy   | j  d ƒ r* t | j d ƒ d ƒ } n t | ƒ } y t | ƒ } Wn t t f k
 rg } d } n X|  j | ƒ d  S(   Nt   xi   u   ï¿½(   t
   startswitht   intt   lstript   unichrt
   ValueErrort   OverflowErrorR   (   R   R   t	   real_nameR   t   e(    (    s<   /scratch/rashmi/Condor_Script/src/bs4/builder/_htmlparser.pyt   handle_charref8   s    
c         C   sB   t  j j | ƒ } | d  k	 r' | } n
 d | } |  j | ƒ d  S(   Ns   &%s;(   R   t   HTML_ENTITY_TO_CHARACTERt   getR   R   (   R   R   t	   characterR   (    (    s<   /scratch/rashmi/Condor_Script/src/bs4/builder/_htmlparser.pyt   handle_entityrefG   s
    	
c         C   s1   |  j  j ƒ  |  j  j | ƒ |  j  j t ƒ d  S(   N(   R   t   endDataR   R   (   R   R   (    (    s<   /scratch/rashmi/Condor_Script/src/bs4/builder/_htmlparser.pyt   handle_commentO   s    c         C   sS   |  j  j ƒ  | j d ƒ r/ | t d ƒ } n  |  j  j | ƒ |  j  j t ƒ d  S(   Ns   DOCTYPE (   R   R&   R   t   lenR   R   (   R   R   (    (    s<   /scratch/rashmi/Condor_Script/src/bs4/builder/_htmlparser.pyt   handle_declT   s
    c         C   se   | j  ƒ  j d ƒ r. t } | t d ƒ } n t } |  j j ƒ  |  j j | ƒ |  j j | ƒ d  S(   Ns   CDATA[(   t   upperR   R   R(   R   R   R&   R   (   R   R   t   cls(    (    s<   /scratch/rashmi/Condor_Script/src/bs4/builder/_htmlparser.pyt   unknown_decl[   s    c         C   sb   |  j  j ƒ  | j d ƒ r> | j ƒ  j d ƒ r> | d  } n  |  j  j | ƒ |  j  j t ƒ d  S(   Nt   ?t   xmliÿÿÿÿ(   R   R&   t   endswitht   lowerR   R   R   (   R   R   (    (    s<   /scratch/rashmi/Condor_Script/src/bs4/builder/_htmlparser.pyt	   handle_pie   s
    $(   t   __name__t
   __module__R   R   R   R!   R%   R'   R)   R,   R1   (    (    (    s<   /scratch/rashmi/Condor_Script/src/bs4/builder/_htmlparser.pyR   -   s   								
c           B   s>   e  Z e Z e e e g Z d  „  Z d d d „ Z
 d „  Z RS(   c         O   s&   t  r t | d <n  | | f |  _ d  S(   Nt   strict(   t   CONSTRUCTOR_TAKES_STRICTt   Falset   parser_args(   R   t   argst   kwargs(    (    s<   /scratch/rashmi/Condor_Script/src/bs4/builder/_htmlparser.pyt   __init__x   s    c         C   s\   t  | t ƒ r | d d t f S| | g } t | | d t ƒ} | j | j | j | j	 f S(   s¸   
        :return: A 4-tuple (markup, original encoding, encoding
        declared within markup, whether any characters had to be
        replaced with REPLACEMENT CHARACTER).
        t   is_htmlN(
   t
   isinstancet   unicodeR   R6   R	   t   Truet   markupt   original_encodingt   declared_html_encodingt   contains_replacement_characters(   R   R?   t   user_specified_encodingt   document_declared_encodingt   try_encodingst   dammit(    (    s<   /scratch/rashmi/Condor_Script/src/bs4/builder/_htmlparser.pyt   prepare_markup}   s    c         C   sn   |  j  \ } } t | | Ž  } |  j | _ y | j | ƒ Wn, t k
 ri } t j t d ƒ ƒ | ‚ n Xd  S(   Ns*  Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.(   R7   R   R   t   feedR   t   warningst   warnt   RuntimeWarning(   R   R?   R8   R9   t   parserR    (    (    s<   /scratch/rashmi/Condor_Script/src/bs4/builder/_htmlparser.pyRH      s    	
N(   R2   R3   R6   t   is_xmlR
   R   t
   HTMLPARSERt   featuresR:   R   RG   RH   (    (    (    s<   /scratch/rashmi/Condor_Script/src/bs4/builder/_htmlparser.pyR    s   s   	sQ   \s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?sê  
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
(   t   tagfindt   attrfindc         C   sÈ  d  |  _ |  j | ƒ } | d k  r( | S|  j } | | | !|  _ g  } t j | | d ƒ } | sl t d ‚ | j ƒ  } | | d | !j ƒ  |  _	 } x| | k  r¤|  j
 rÃ t j | | ƒ } n t j | | ƒ } | sß Pn  | j d d d ƒ \ }	 }
 } |
 sd  } nX | d  d k o+| d k n sT| d  d k oO| d k n rd| d d !} n  | r||  j | ƒ } n  | j |	 j ƒ  | f ƒ | j ƒ  } q™ W| | | !j ƒ  } | d k rs|  j ƒ  \ } } d |  j k r| |  j j d ƒ } t |  j ƒ |  j j d ƒ } n | t |  j ƒ } |  j
 r[|  j d | | | !d  f ƒ n  |  j | | | !ƒ | S| j d
 ƒ r•|  j | | ƒ n/ |  j | | ƒ | |  j k rÄ|  j | ƒ n  | S(   Ni    i   s#   unexpected call to parse_starttag()i   i   s   'iÿÿÿÿt   "t   >s   />s   
s    junk characters in start tag: %ri   (   RS   s   />(   R   t   __starttag_textt   check_for_whole_start_tagt   rawdataRP   t   matcht   AssertionErrort   endR0   t   lasttagR4   RQ   t   attrfind_tolerantt   groupt   unescapet   appendt   stript   getpost   countR(   t   rfindt   errorR   R/   t   handle_startendtagR   t   CDATA_CONTENT_ELEMENTSt   set_cdata_mode(   R   t   it   endposRV   R   RW   t   kt   tagt   mt   attrnamet   restt	   attrvalueRY   t   linenot   offset(    (    s<   /scratch/rashmi/Condor_Script/src/bs4/builder/_htmlparser.pyt   parse_starttag¶   s\    				$$		c         C   s2   | j  ƒ  |  _ t j d |  j t j ƒ |  _ d  S(   Ns   </\s*%s\s*>(   R0   t
   cdata_elemt   ret   compilet   It   interesting(   R   t   elem(    (    s<   /scratch/rashmi/Condor_Script/src/bs4/builder/_htmlparser.pyRf   í   s    (&   t   __doc__t   __all__R   R   t   sysRI   t   version_infot   majort   minort   releaseR5   t   bs4.elementR   R   R   R   R   t
   bs4.dammitR   R	   t   bs4.builderR
   R   R   RN   R   R    Rs   Rt   R[   t   VERBOSEt   locatestarttagendt   html.parserRP   RQ   Rq   Rf   R>   (    (    (    s<   /scratch/rashmi/Condor_Script/src/bs4/builder/_htmlparser.pyt   <module>   s8   		$(F*				7			