
uXQc           @   s*  d  Z  d d l Z d d l m Z d d l Z d d l Z d Z y d d l Z d   Z	 WnG e
 k
 r y d d l Z d   Z	 Wq e
 k
 r d   Z	 q Xn Xy d d l Z Wn e
 k
 r n Xe j d j   e j  Z e j d j   e j  Z d	 e f d
     YZ d d d     YZ d S(   s&  Beautiful Soup bonus library: Unicode, Dammit

This class forces XML data into a standard format (usually to UTF-8 or
Unicode).  It is heavily based on code from Mark Pilgrim's Universal
Feed Parser. It does not rewrite the XML or HTML to reflect a new
encoding; that's the tree builder's job.
iN(   t   codepoint2namec         C   s   t  j |   d S(   Nt   encoding(   t   cchardett   detect(   t   s(    (    s/   /scratch/rashmi/Condor_Script/src/bs4/dammit.pyt   chardet_dammit   s    c         C   s   t  j |   d S(   NR   (   t   chardetR   (   R   (    (    s/   /scratch/rashmi/Condor_Script/src/bs4/dammit.pyR      s    c         C   s   d  S(   N(   t   None(   R   (    (    s/   /scratch/rashmi/Condor_Script/src/bs4/dammit.pyR   #   s    s!   ^<\?.*encoding=['"](.*?)['"].*\?>s0   <\s*meta[^>]+charset\s*=\s*["']?([^>]*?)[ /;'">]t   EntitySubstitutionc           B   s   e  Z d  Z d   Z e   \ Z Z Z i d d 6d d 6d d 6d d	 6d
 d 6Z e j	 d  Z
 e d    Z e d    Z e d    Z e e d   Z e d    Z RS(   sA   Substitute XML or HTML entities for the corresponding characters.c          C   s   i  }  i  } g  } x\ t  t j    D]H \ } } t |  } | d k rc | j |  | |  | <n  | | | <q% Wd d j |  } |  | t j |  f S(   Ni"   s   [%s]t    (   t   listR    t   itemst   unichrt   appendt   joint   ret   compile(   t   lookupt   reverse_lookupt   characters_for_ret	   codepointt   namet	   charactert   re_definition(    (    s/   /scratch/rashmi/Condor_Script/src/bs4/dammit.pyt   _populate_class_variables5   s    t   apost   't   quott   "t   ampt   &t   ltt   <t   gtt   >s&   ([<>]|&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;))c         C   s#   |  j  j | j d   } d | S(   Ni    s   &%s;(   t   CHARACTER_TO_HTML_ENTITYt   gett   group(   t   clst   matchobjt   entity(    (    s/   /scratch/rashmi/Condor_Script/src/bs4/dammit.pyt   _substitute_html_entityT   s    c         C   s   |  j  | j d  } d | S(   sm   Used with a regular expression to substitute the
        appropriate XML entity for an XML special character.i    s   &%s;(   t   CHARACTER_TO_XML_ENTITYR%   (   R&   R'   R(   (    (    s/   /scratch/rashmi/Condor_Script/src/bs4/dammit.pyt   _substitute_xml_entityY   s    c         C   sN   d } d | k rB d | k r9 d } | j  d |  } qB d } n  | | | S(   s*  Make a value into a quoted XML attribute, possibly escaping it.

         Most strings will be quoted using double quotes.

          Bob's Bar -> "Bob's Bar"

         If a string contains double quotes, it will be quoted using
         single quotes.

          Welcome to "my bar" -> 'Welcome to "my bar"'

         If a string contains both single and double quotes, the
         double quotes will be escaped, and the string will be quoted
         using double quotes.

          Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's bar&quot;
        R   R   s   &quot;(   t   replace(   t   selft   valuet
   quote_witht   replace_with(    (    s/   /scratch/rashmi/Condor_Script/src/bs4/dammit.pyt   quoted_attribute_value`   s    	c         C   s4   |  j  j |  j |  } | r0 |  j |  } n  | S(   s  Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign will
          become &lt;, the greater-than sign will become &gt;, and any
          ampersands that are not part of an entity defition will
          become &amp;.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.
        (   t   BARE_AMPERSAND_OR_BRACKETt   subR+   R1   (   R&   R.   t   make_quoted_attribute(    (    s/   /scratch/rashmi/Condor_Script/src/bs4/dammit.pyt   substitute_xml   s
    	c         C   s   |  j  j |  j |  S(   s  Replace certain Unicode characters with named HTML entities.

        This differs from data.encode(encoding, 'xmlcharrefreplace')
        in that the goal is to make the result more readable (to those
        with ASCII displays) rather than to recover from
        errors. There's absolutely nothing wrong with a UTF-8 string
        containg a LATIN SMALL LETTER E WITH ACUTE, but replacing that
        character with "&eacute;" will make it more readable to some
        people.
        (   t   CHARACTER_TO_HTML_ENTITY_RER3   R)   (   R&   R   (    (    s/   /scratch/rashmi/Condor_Script/src/bs4/dammit.pyt   substitute_html   s    	(   t   __name__t
   __module__t   __doc__R   R#   t   HTML_ENTITY_TO_CHARACTERR6   R*   R   R   R2   t   classmethodR)   R+   R1   t   FalseR5   R7   (    (    (    s/   /scratch/rashmi/Condor_Script/src/bs4/dammit.pyR   1   s    	
%t   UnicodeDammitc           B   s  e  Z d  Z i d d 6d d 6Z d d d g Z g  de d  Z d	   Z d
 d  Z	 d
 d  Z
 e d  Z d   Z d   Z dZ d   Z i  dd 6d d 6dd 6dd 6d	d 6d
d! 6dd$ 6dd' 6dd* 6dd- 6dd0 6dd3 6dd6 6d7 d8 6dd; 6d7 d< 6d7 d= 6dd@ 6ddC 6ddF 6ddI 6ddL 6ddO 6ddR 6ddU 6ddX 6dd[ 6dd^ 6dda 6d7 db 6dde 6d dh 6Z i di d 6d d 6dj d 6dk d 6dl d 6dm d! 6dn d$ 6do d' 6dp d* 6dq d- 6dr d0 6ds d3 6dt d6 6d7 d8 6du d; 6d7 d< 6d7 d= 6dv d@ 6dv dC 6dw dF 6dw dI 6dx dL 6dy dO 6dz dR 6d{ dU 6d| dX 6d} d[ 6d~ d^ 6d da 6d7 db 6d de 6d dh 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6dr d 6d d 6dg d 6d d 6d d 6d d 6d d 6d d 6dy d 6d d 6d d 6d d 6d d 6d!d 6d d 6d d 6dx d 6dj d 6d d 6d d 6d d 6d d 6d d 6d d 6d7 d 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6dx d 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6d d 6d d6d d6d d6d d6d d6dd6d d6dd	6Z iz d
d6dd6dd6dd6dd6dd6dd6dd6dd6dd6dd6d d!6d"d#6d$d%6d&d'6d(d)6d*d+6d,d-6d.d/6d0d16d2d36d4d56d6d76d8d96d:d;6d<d=6d>d?6d@dA6dBdC6dDdE6dFdG6dHdI6dJdK6dLdM6dNdO6dPdQ6dRdS6dTdU6dVdW6dXdY6dZd[6d\d]6d^d_6d`da6dbdc6ddde6dfdg6dhdi6djdk6dldm6dndo6dpdq6drds6dtdu6dvdw6dxdy6dzd{6d|d}6d~d6dd6dd6dd6dd6dd6dd6dd6dd6dd6dd6dd6dd6dd6dd6dd6dd6dd6dd6dd6dd6dd6dd6dd6dd6dd6dd6dd6dd6dd6dd6dd6dd6dd6d d6dd6dd6dd6dd6dd6dd6dd6dd6dd6dd6dd6dd6dd6dd6dd6dd6dd6dd6dd6dd6dd6dd6dd6dd6dd6dd6dd6dd6dd6Z d"d#d$g Z e d d Z e ddZ e dd d  Z RS(%  s   A class for detecting the encoding of a *ML document and
    converting it to a Unicode string. If the source encoding is
    windows-1252, can replace MS smart quotes with their HTML or XML
    equivalents.s	   mac-romant	   macintoshs	   shift-jiss   x-sjiss   windows-1252s
   iso-8859-1s
   iso-8859-2c   
      C   s  d  |  _ | |  _ g  |  _ t |  _ | d k s? t | t  rd | |  _ t |  |  _	 d  |  _
 d  S|  j | |  \ } } } | |  _ d  } | | k r |  j d  } | |  _
 n  | sxC | | | g D]. }	 |	 d  k	 r |  j |	  } | r Pq q q Wn  | r6t |  j t  r6|  j t |  j   } n  | six* d D] }	 |  j |	  } | rCPqCqCWn  | sxh | | | d d g D]M }	 |	 d k r|  j |	 d  } n  | d  k	 rt j d  t |  _ PqqWn  | |  _	 | sd  |  _
 n  d  S(	   NR	   t   utf8s   utf-8s   windows-1252t   asciiR,   sS   Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.(   s   utf-8s   windows-1252(   R   t   declared_html_encodingt   smart_quotes_tot   tried_encodingsR=   t   contains_replacement_characterst
   isinstancet   unicodet   markupt   unicode_markupt   original_encodingt   _detectEncodingt   _convert_fromR   t   loggingt   warningt   True(
   R-   RH   t   override_encodingsRC   t   is_htmlt
   new_markupt   document_encodingt   sniffed_encodingt   ut   proposed_encoding(    (    s/   /scratch/rashmi/Condor_Script/src/bs4/dammit.pyt   __init__   sV    									c         C   s   | j  d  } |  j d k r9 |  j j |  j   } n |  j j |  } t |  t k r |  j d k r d j   | d j   d j   } q d j   | d j   d j   } n | j   } | S(   s[   Changes a MS smart quote character to an XML or HTML
        entity, or an ASCII character.i   RA   t   xmls   &#xt   ;R   i    (   R%   RC   t   MS_CHARS_TO_ASCIIR$   t   encodet   MS_CHARSt   typet   tuple(   R-   t   matcht   origR3   (    (    s/   /scratch/rashmi/Condor_Script/src/bs4/dammit.pyt   _sub_ms_char   s    ''t   strictc         C   s   |  j  |  } | s+ | | f |  j k r/ d  S|  j j | | f  |  j } |  j d  k	 r | j   |  j k r d } t j	 |  } | j
 |  j |  } n  y+ |  j | | |  } | |  _ | |  _ Wn t k
 r } d  SX|  j S(   Ns   ([-])(   t
   find_codecRD   R   R   RH   RC   t   lowert   ENCODINGS_WITH_SMART_QUOTESR   R   R3   Ra   t   _to_unicodeRJ   t	   Exception(   R-   t   proposedt   errorsRH   t   smart_quotes_ret   smart_quotes_compiledRU   t   e(    (    s/   /scratch/rashmi/Condor_Script/src/bs4/dammit.pyRL     s"    		c         C   s  t  |  d k rH | d  d k rH | d d !d k rH d } | d } n t  |  d k r | d  d k r | d d !d k r d } | d } ni | d  d	 k r d
 } | d } nF | d  d k r d } | d } n# | d  d k r d } | d } n  t | | |  } | S(   s   Given a string and its encoding, decodes the string into Unicode.
        %encoding is a string recognized by encodings.aliasesi   i   s   t     s   utf-16bes   s   utf-16lei   s   ﻿s   utf-8t     s   utf-32bes     s   utf-32le(   t   lenRG   (   R-   t   dataR   Ri   t   newdata(    (    s/   /scratch/rashmi/Condor_Script/src/bs4/dammit.pyRf   *  s&    ""c         C   s  d" } } yC| d  d k r/ |  j |  } n| d  d k r` d } t | d  j d  } nt |  d k r | d  d k r | d d !d k r d } t | d d  j d  } n| d  d	 k r d
 } t | d
  j d  } nat |  d k rE| d  d k rE| d d !d k rEd
 } t | d d
  j d  } n| d  d k rvd } t | d  j d  } n | d  d k rd } t | d  j d  } n | d  d k rd } t | d d  j d  } np | d  d k rd } t | d d  j d  } n; | d  d k rFd } t | d d  j d  } n d } Wn d" } n Xt j |  } | r| rt j |  } n  | d" k	 r| j	   d j
 d  j   } | r| |  _ n  | r| d# k r| } qn  | | | f S($   s3   Given a document, tries to detect its XML encoding.i   s   Lot    < ?s   utf-16bes   utf-8i   s   Rm   s   < ? s   utf-16les   t      <s   utf-32bes   <   s   utf-32leRn   s     i   s   ﻿RA   i    s   iso-10646-ucs-2s   ucs-2t	   csunicodes   iso-10646-ucs-4s   ucs-4t   csucs4s   utf-16s   utf-32t   utf_16t   utf_32t   utf16t   u16N(   s   iso-10646-ucs-2s   ucs-2Rt   s   iso-10646-ucs-4s   ucs-4Ru   s   utf-16s   utf-32s   utf_16s   utf_32s   utf16s   u16(   R   t   _ebcdic_to_asciiRG   R[   Ro   t   xml_encoding_reR_   t   html_meta_ret   searcht   groupst   decodeRd   RB   (   R-   t   xml_dataRQ   t   xml_encodingt   sniffed_xml_encodingt   xml_encoding_match(    (    s/   /scratch/rashmi/Condor_Script/src/bs4/dammit.pyRK   C  sh    
""
  	c         C   sd   |  j  |  j j | |   pc | r? |  j  | j d d   pc | r` |  j  | j d d   pc | S(   Nt   -R	   t   _(   t   _codect   CHARSET_ALIASESR$   R,   (   R-   t   charset(    (    s/   /scratch/rashmi/Condor_Script/src/bs4/dammit.pyRc     s    !!c         C   sE   | s
 | Sd  } y t j |  | } Wn t t f k
 r@ n X| S(   N(   R   t   codecsR   t   LookupErrort
   ValueError(   R-   R   t   codec(    (    s/   /scratch/rashmi/Condor_Script/src/bs4/dammit.pyR     s    
c         C   s|   |  j  } | j sl d} dd  l } | j dj t t t t d    dj t t |    | _ n  | j	 | j  S(  Ni    i   i   i   i   i	   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i
   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i    i   i   i   i   i   i   i   i   i   i[   i.   i<   i(   i+   i!   i&   i   i   i   i   i   i   i   i   i   i]   i$   i*   i)   i;   i^   i-   i/   i   i   i   i   i   i   i   i   i|   i,   i%   i_   i>   i?   i   i   i   i   i   i   i   i   i   i`   i:   i#   i@   i'   i=   i"   i   ia   ib   ic   id   ie   if   ig   ih   ii   i   i   i   i   i   i   i   ij   ik   il   im   in   io   ip   iq   ir   i   i   i   i   i   i   i   i~   is   it   iu   iv   iw   ix   iy   iz   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i{   iA   iB   iC   iD   iE   iF   iG   iH   iI   i   i   i   i   i   i   i}   iJ   iK   iL   iM   iN   iO   iP   iQ   iR   i   i   i   i   i   i   i\   i   iS   iT   iU   iV   iW   iX   iY   iZ   i   i   i   i   i   i   i0   i1   i2   i3   i4   i5   i6   i7   i8   i9   i   i   i   i   i   i   iR	   i   (   i    i   i   i   i   i	   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i
   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i    i   i   i   i   i   i   i   i   i   i[   i.   i<   i(   i+   i!   i&   i   i   i   i   i   i   i   i   i   i]   i$   i*   i)   i;   i^   i-   i/   i   i   i   i   i   i   i   i   i|   i,   i%   i_   i>   i?   i   i   i   i   i   i   i   i   i   i`   i:   i#   i@   i'   i=   i"   i   ia   ib   ic   id   ie   if   ig   ih   ii   i   i   i   i   i   i   i   ij   ik   il   im   in   io   ip   iq   ir   i   i   i   i   i   i   i   i~   is   it   iu   iv   iw   ix   iy   iz   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i   i{   iA   iB   iC   iD   iE   iF   iG   iH   iI   i   i   i   i   i   i   i}   iJ   iK   iL   iM   iN   iO   iP   iQ   iR   i   i   i   i   i   i   i\   i   iS   iT   iU   iV   iW   iX   iY   iZ   i   i   i   i   i   i   i0   i1   i2   i3   i4   i5   i6   i7   i8   i9   i   i   i   i   i   i   (
   t	   __class__t   EBCDIC_TO_ASCII_MAPt   stringt	   maketransR   t   mapt   chrR
   t   ranget	   translate(   R-   R   t   ct   emapR   (    (    s/   /scratch/rashmi/Condor_Script/src/bs4/dammit.pyRz     s.    		                Bt   eurot   20ACs   t    s   t   sbquot   201As   t   fnoft   192s   t   bdquot   201Es   t   hellipt   2026s   t   daggert   2020s   t   Daggert   2021s   t   circt   2C6s   t   permilt   2030s   t   Scaront   160s   t   lsaquot   2039s   t   OEligt   152s   t   ?s   s   #x17Dt   17Ds   s   s   t   lsquot   2018s   t   rsquot   2019s   t   ldquot   201Cs   t   rdquot   201Ds   t   bullt   2022s   t   ndasht   2013s   t   mdasht   2014s   t   tildet   2DCs   t   tradet   2122s   t   scaront   161s   t   rsaquot   203As   t   oeligt   153s   s   s   #x17Et   17Es   t   YumlR	   s   t   EURt   ,t   fs   ,,s   ...t   +s   ++t   ^t   %t   SR    t   OEt   ZR   R   t   *R   s   --t   ~s   (TM)R   R"   t   oet   zt   Ys   t   !s   R   s   t   GBPs   t   $s   t   YENs   t   |s   s   s   ..s   s   s   (th)s   s   <<s   s   s   s   (R)s   s   t   os   s   +-s   t   2s   t   3s   t   acutes   RU   s   t   Ps   s   s   t   1s   s   s   >>s   s   1/4s   s   1/2s   s   3/4s   s   t   As   s   s   s   s   s   t   AEs   t   Cs   t   Es   s   s   s   t   Is   s   s   s   t   Ds   t   Ns   t   Os   s   s   s   s   s   s   t   Us   s   s   s   s   t   bs   t   Bs   t   as   s   s   s   s   s   t   aes   s   Rl   s   s   s   s   t   is   s   s   s   s   t   ns   s   s   s   s   s   t   /s   s   s   s   s   s   t   ys   s   s   s   €i   s   ‚i   s   ƒi   s   „i   s   …i   s   †i   s   ‡i   s   ˆi   s   ‰i   s   Ši   s   ‹i   s   Œi   s   Ži   s   ‘i   s   ’i   s   “i   s   ”i   s   •i   s   –i   s   —i   s   ˜i   s   ™i   s   ši   s   ›i   s   œi   s   ži   s   Ÿi   s    i   s   ¡i   s   ¢i   s   £i   s   ¤i   s   ¥i   s   ¦i   s   §i   s   ¨i   s   ©i   s   ªi   s   «i   s   ¬i   s   ­i   s   ®i   s   ¯i   s   °i   s   ±i   s   ²i   s   ³i   s   ´i   s   µi   s   ¶i   s   ·i   s   ¸i   s   ¹i   s   ºi   s   »i   s   ¼i   s   ½i   s   ¾i   s   ¿i   s   Ài   s   Ái   s   Âi   s   Ãi   s   Äi   s   Åi   s   Æi   s   Çi   s   Èi   s   Éi   s   Êi   s   Ëi   s   Ìi   s   Íi   s   Îi   s   Ïi   s   Ði   s   Ñi   s   Òi   s   Ói   s   Ôi   s   Õi   s   Öi   s   ×i   s   Øi   s   Ùi   s   Úi   s   Ûi   s   Üi   s   Ýi   s   Þi   s   ßi   s   ài   i   s   âi   s   ãi   s   äi   s   åi   s   æi   s   çi   s   èi   s   éi   s   êi   s   ëi   s   ìi   s   íi   s   îi   s   ïi   s   ði   s   ñi   s   òi   s   ói   s   ôi   s   õi   s   öi   s   ÷i   s   øi   s   ùi   s   úi   s   ûi   s   üi   s   ýi   s   þi   i   i   i   i    ii   R@   c         C   s  | j  d d  j   d k r- t d   n  | j   d k rN t d   n  g  } d	 } d	 } x| t |  k  rd| | } t | t  s t |  } n  | |  j k r| |  j k rx |  j	 D]5 \ } }	 }
 | | k r | |	 k r | |
 7} Pq q Wqc | d
 k rW| |  j
 k rW| j | | | ! | j |  j
 |  | d 7} | } qc | d 7} qc W| d	 k ru| S| j | |  d j |  S(   s  Fix characters from one encoding embedded in some other encoding.

        Currently the only situation supported is Windows-1252 (or its
        subset ISO-8859-1), embedded in UTF-8.

        The input must be a bytestring. If you've already converted
        the document to Unicode, you're too late.

        The output is a bytestring in which `embedded_encoding`
        characters have been converted to their `main_encoding`
        equivalents.
        R   R   s   windows-1252t   windows_1252sP   Windows-1252 and ISO-8859-1 are the only currently supported embedded encodings.R@   s   utf-8s4   UTF-8 is the only currently supported main encoding.i    i   i   R	   (   s   windows-1252s   windows_1252(   s   utf8s   utf-8(   R,   Rd   t   NotImplementedErrorRo   RF   t   intt   ordt   FIRST_MULTIBYTE_MARKERt   LAST_MULTIBYTE_MARKERt   MULTIBYTE_MARKERS_AND_SIZESt   WINDOWS_1252_TO_UTF8R   R   (   R&   t   in_bytest   main_encodingt   embedded_encodingt   byte_chunkst   chunk_startt   post   bytet   startt   endt   size(    (    s/   /scratch/rashmi/Condor_Script/src/bs4/dammit.pyt	   detwingle  s<    	


	N(   R   R   (   R   R   (   R   R   (   R   R   (   R   R   (   R   R   (   R   R   (   R   R   (   R   R   (   R   R   (   R   R   (   R   R   (   s   #x17DR   (   R   R   (   R   R   (   R   R   (   R   R   (   R   R   (   R   R   (   R   R   (   R   R   (   R   R   (   R   R   (   R   R   (   R   R   (   s   #x17ER   (   R   R	   (   R   R   (   i   i   i   (   i   i   i   (   i   i   i   (   R8   R9   R:   R   Re   R   R=   RW   Ra   RL   Rf   RK   Rc   R   R   Rz   R\   RZ   R   R   R   R   R<   R  (    (    (    s/   /scratch/rashmi/Condor_Script/src/bs4/dammit.pyR>      sh  

	?	B			



	(    (   R:   R   t   htmlentitydefsR    R   RM   R   t   chardet_typeR   R   t   ImportErrorR   t   iconv_codecR   R[   R   R{   R|   t   objectR   R>   (    (    (    s/   /scratch/rashmi/Condor_Script/src/bs4/dammit.pyt   <module>   s0   y