o
    hf                    @   s  d Z ddlZddlmZ ddlZddlZddlZddlZddlm	Z	 ddl
ZddlmZ ddlmZmZmZmZ ddlmZ ddlmZ dd	lmZmZmZ G d
d de	ZedZedZdZdZ dZ!dZ"dZ#dZ$dZ%dZ&dZ'dZ(dZ)dZ*dZ+dZ,dZ-dZ.dZ/dZ0d Z1e de!d!e"d"e#d#e$d$e%d%e&d&e'd'e(d(e)d)i
Z2e3e2Z4G d*d+ d+ej5Z6G d,d- d-eZ7G d.d/ d/eZ8d0d1 Z9d;d3d4Z:G d5d6 d6eZ;G d7d8 d8eZ<G d9d: d:eZ=dS )<z
Basic data structures
    N)repeat)Enum)StanzaObject)misc_to_space_afterspace_after_to_miscmisc_to_space_beforespace_before_to_misc)decode_from_bioes)tree_reader)CorefMention
CorefChainCorefAttachmentc                   @   s   e Zd ZdZdZdZdS )MWTProcessingTyper         N)__name__
__module____qualname__FLATTENPROCESSSKIP r   r   S/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/models/common/doc.pyr      s    r   z([0-9]+)-([0-9]+)z.*MWT=Yes.*manual_expansionidtextlemmauposxposfeatsheaddepreldepsmiscner	multi_ner
start_charend_chartype	sentimentconstituencycoref_chainsr   r                     	   c                   @   s   e Zd Zdd ZdS )DocJSONEncoderc                 C   s0   t |tr|jS t |tr| S tj| |S N)
isinstancer   __dict__r   to_jsonjsonJSONEncoderdefault)selfobjr   r   r   r:   7   s
   

zDocJSONEncoder.defaultN)r   r   r   r:   r   r   r   r   r3   6   s    r3   c                   @   s  e Zd ZdZdDddZdd Zedd Zejd	d Zed
d Z	e	jdd Z	edd Z
e
jdd Z
edd Zejdd Zedd Zejdd Zedd Zejdd Zedd Zejdd ZdEddZdd ZdFd!d"ZdFd#d$Z	 	dGd%d&ZdHd'd(Zd)d* Zd+d, Zd-d. Zd/d0 Zd1d2 Zed3d4 Zejd5d4 Zd6d7 Zd8d9 Zd:d; Zd<d= Zd>d? Zd@dA Z e!dBdC Z"dS )IDocumentz\ A document class that stores attributes of a document and carries a list of sentences.
    Nc                 C   sZ   g | _ d| _|| _d| _d| _| ||| g | _g | _| jdur+|   | 	  dS dS )a   Construct a document given a list of sentences in the form of lists of CoNLL-U dicts.

        Args:
            sentences: a list of sentences, which being a list of token entry, in the form of a CoNLL-U dict.
            text: the raw text of the document.
            comments: A list of list of strings to use as comments on the sentences, either None or the same length as sentences
        Nr   )

_sentences_lang_text_num_tokens
_num_words_process_sentences_ents_coref
build_entsmark_whitespace)r;   	sentencesr   commentsempty_sentencesr   r   r   __init__B   s   
zDocument.__init__c           	      C   s<  | j D ]"}t|jd d |jdd  D ]\}}| j|j|j }||_qqt| j d d | j dd  D ]\}}|jd }|jd }| j|j|j }||_q5t| j dkrtt| j d jdkrt| j d jd }| j|jd  }||_t| j dkrt| j d jdkr| j d jd }| jd |j }||_d S d S d S )Nr   r   )	r>   ziptokensr@   r'   r&   spaces_afterlenspaces_before)	r;   sentence
prev_token
next_token
whitespaceprev_sentencenext_sentencefinal_tokenfirst_tokenr   r   r   rG   W   s&   
&&

""
zDocument.mark_whitespacec                 C      | j S )z& Access the language of this document r?   r;   r   r   r   langl      zDocument.langc                 C   
   || _ dS )z# Set the language of this document Nr[   r;   valuer   r   r   r]   q      
c                 C   rZ   )z( Access the raw text for this document. r@   r\   r   r   r   r   v   r^   zDocument.textc                 C   r_   )z% Set the raw text for this document. Nrc   r`   r   r   r   r   {   rb   c                 C   rZ   )z1 Access the list of sentences for this document. r>   r\   r   r   r   rH      r^   zDocument.sentencesc                 C   r_   )z+ Set the list of tokens for this document. Nrd   r`   r   r   r   rH      rb   c                 C   rZ   )z0 Access the number of tokens for this document. rA   r\   r   r   r   
num_tokens   r^   zDocument.num_tokensc                 C   r_   )z- Set the number of tokens for this document. Nre   r`   r   r   r   rf      rb   c                 C   rZ   )z/ Access the number of words for this document. rB   r\   r   r   r   	num_words   r^   zDocument.num_wordsc                 C   r_   )z, Set the number of words for this document. Nrg   r`   r   r   r   rh      rb   c                 C   rZ   )z/ Access the list of entities in this document. rD   r\   r   r   r   ents   r^   zDocument.entsc                 C   r_   z, Set the list of entities in this document. Nri   r`   r   r   r   rj      rb   c                 C   rZ   z? Access the list of entities. This is just an alias of `ents`. ri   r\   r   r   r   entities   r^   zDocument.entitiesc                 C   r_   rk   ri   r`   r   r   r   rm      rb   c                 C   s   g | _ |d u rtg }tt||D ]o\}\}}z	t|| |d}W n3 ty4 } ztd| |d }~w tyT } zdd |D }d|}td||f |d }~ww | j | |j	d j
|j	d j}	}
t| jd u|	d u|
d ufr~| j|	|
 |_||_q|   |sd	d | j D }nd
d |D }t| j |D ]o\}}|jrtdd |D s|dd|j   n)|js|D ]#}|ds|ds|ds|dr|ddd  |_ nq|D ]}|| q|D ]}|dr|ddd  |_ nqt|j|_qd S )N)docempty_wordsz)Could not process document at sentence %dc                 S   s   g | ]}d | qS )z|%s|r   ).0tr   r   r   
<listcomp>       z/Document._process_sentences.<locals>.<listcomp>, z:Could not process document at sentence %d
  Raw tokens: %sr   rL   c                 S   s   g | ]}g qS r   r   rp   xr   r   r   rr          c                 S      g | ]}t |qS r   )listru   r   r   r   rr      rs   c                 s   s8    | ]}| d p| dp| dp| dV  qdS )# text #text # text=#text=N
startswithrp   commentr   r   r   	<genexpr>   s   6 z.Document._process_sentences.<locals>.<genexpr>z	# text =  rz   r{   r|   r}   =r   z	# sent_id)rH   r   	enumeraterM   Sentence
IndexError
ValueErrorjoinappendrN   r&   r'   allr   index_count_wordsanysplitr   stripadd_commentsent_idstr)r;   rH   rI   rJ   sent_idxrN   ro   rR   e	begin_idxend_idxsentence_commentsr   r   r   r   rC      sT   
,(zDocument._process_sentencesc                 C   s0   t dd | jD | _t dd | jD | _dS )z6
        Count the number of tokens and words
        c                 S      g | ]}t |jqS r   )rP   rN   rp   rR   r   r   r   rr          z)Document._count_words.<locals>.<listcomp>c                 S   r   r   rP   wordsr   r   r   r   rr      r   N)sumrH   rf   rh   r\   r   r   r   r      s   zDocument._count_wordsFc                    s   t |tr|g}t |tsJ dt|dksJ dg }| jD ];}g }|r*|j}n|j}|D ] t|dkrB|t |d g7 }q/| fdd|D g7 }q/|rW|| q ||7 }q |S )aI   Get fields from a list of field names.
        If only one field name (string or singleton list) is provided,
        return a list of that field; if more than one, return a list of list.
        Note that all returned fields are after multi-word expansion.

        Args:
            fields: name of the fields as a list or a single string
            as_sentences: if True, return the fields as a list of sentences; otherwise as a whole list
            from_token: if True, get the fields from Token; otherwise from Word

        Returns:
            All requested fields.
        #Must provide field names as a list.r   Must have at least one field.r   c                       g | ]}t  |qS r   getattr)rp   fieldunitr   r   rr     r   z Document.get.<locals>.<listcomp>)	r5   r   ry   rP   rH   rN   r   r   r   )r;   fieldsas_sentences
from_tokenresultsrR   cursentunitsr   r   r   get   s$   


zDocument.getc                 C   s|  t |tr|g}t |ttfsJ dt |ttfsJ dt|dks(J d|r0|r0J d|rkt| jt|ks?J dt| j|D ]#\}}t|dkrXt||d | qEt||D ]
\}}t||| q]qEd	S |rt| jt|ks| j	t|ksJ dd}	| jD ]7}|r|j
}
n|j}
|
D ])}t|dkrt||d ||	  nt|||	 D ]
\}}t||| q|	d7 }	qqd	S )
a  Set fields based on contents. If only one field (string or
        singleton list) is provided, then a list of content will be
        expected; otherwise a list of list of contents will be expected.

        Args:
            fields: name of the fields as a list or a single string
            contents: field values to set; total length should be equal to number of words/tokens
            to_token: if True, set field values to tokens; otherwise to words

        r   z4Must provide contents as a list (one item per line).r   r   zBBoth to_token and to_sentence set to True, which is very confusingz3Contents must have the same length as the sentencesr   z8Contents must have the same length as the original file.N)r5   r   tuplery   rP   rH   rM   setattrrf   rh   rN   r   )r;   r   contentsto_tokento_sentencerR   contentr   piececidxr   r   r   r   r   set  sB   
"

zDocument.setc                    s  d | j D ]_}d}|jD ]}|d7 }t|jdk}|jdur%t|jnd}|j}	tj	}
|r5|	r5tj
}
n|dkr?|r?tj
}
n|dkrI|	rItj}
n|dkrT|sQ|rTtj
}
|
tj	krp|jD ]}|f|_d|_d\|_|_||_q\q|
tj
krdd |  dD }t|dkr|jg} d7  |t| d }|jr|jd	krdnd
dd |jd
D |_t|dkr||fn|f|_g |_t|D ]\}}|jt|t|| t|i q|}q|
tjkrt fdd|jD |_|jD ]	}| j 7  _q|jd }d|_qg |_|jD ]M}||_|jD ]}||_||_|j| q|jdurW|jdurWddd |jD |jkrW|j}|jD ]}|t|j }||_||_|}qDq|ra|  q|  q|    t|ks{J d  t|dS )a   Extend the multi-word tokens annotated by tokenizer. A list of list of expansions
        will be expected for each multi-word token. Use `process_manual_expanded` to limit
        processing for tokens marked manually expanded:

        There are two types of MWT expansions: those with `misc`: `MWT=True`, and those with
        `manual_expansion`: True. The latter of which means that it is an expansion which the
        user manually specified through a postprocessor; the former means that it is a MWT
        which the detector picked out, but needs to be automatically expanded.

        process_manual_expanded = None - default; doesn't process manually expanded tokens
                                = True - process only manually expanded tokens (with `manual_expansion`: True)
                                = False - process only tokens explicitly tagged as MWT (`misc`: `MWT=True`)
        r   r   NFNNc                 S   s   g | ]
}t |d kr|qS r   )rP   ru   r   r   r   rr   p      z/Document.set_mwt_expansions.<locals>.<listcomp>r   MWT=Yes|c                 S   s   g | ]}|d kr|qS )r   r   ru   r   r   r   rr   z  s    c                 3   s    | ]}|  V  qd S r4   r   )rp   orig_ididx_er   r   r         z.Document.set_mwt_expansions.<locals>.<genexpr>rL    c                 s   s    | ]}|j V  qd S r4   r   rp   wordr   r   r   r     s    z{} {})!rH   rN   rP   r   r#   multi_word_token_miscmatchr   r   r   r   r   r   r"   r    r!   r   r   r   r   r   WordIDTEXTr   sentparentr&   r'   build_fake_dependenciesrebuild_dependenciesr   format)r;   
expansionsfake_dependenciesprocess_manual_expandedrR   idx_wtokenis_multiis_mwtis_manual_expansionperform_mwt_processingr   expanded	idx_w_endie_wordr&   r'   r   r   r   set_mwt_expansionsC  s   



, 



6


"zDocument.set_mwt_expansionsc           
      C   s   g }| j D ]:}|jD ]4}t|jdk}|jdurt|jnd}|j}|r'|r)|r>|j}d	dd |j
D }	|||	g q
q|rIdd |D }|S )a1   Get the multi-word tokens. For training, return a list of
        (multi-word token, extended multi-word token); otherwise, return a list of
        multi-word token only. By default doesn't skip already expanded tokens, but
        `skip_already_expanded` will return only tokens marked as MWT.
        r   Nr   c                 S      g | ]}|j qS r   r   r   r   r   r   rr         z/Document.get_mwt_expansions.<locals>.<listcomp>c                 S   s   g | ]}|d  qS r   r   )rp   r   r   r   r   rr     rs   )rH   rN   rP   r   r#   r   r   r   r   r   r   r   )
r;   
evaluationr   rR   r   r   r   r   srcdstr   r   r   get_mwt_expansions  s   

zDocument.get_mwt_expansionsc                 C   s.   g | _ | jD ]}| }|  j |7  _ q| j S )zX Build the list of entities by iterating over all words. Return all entities as a list. )rj   rH   rF   )r;   ss_entsr   r   r   rF     s
   
zDocument.build_entsc                 C   sD   | j D ]}|jD ]}|jsq|jd}t|}d||_qqdS )zS Sort the features on all the words... useful for prototype treebanks, for example r   N)rH   r   r   r   sortedr   )r;   rR   r   piecesr   r   r   sort_features  s   

zDocument.sort_featuresc                 c       | j D ]}|jE dH  qdS )z= An iterator that returns all of the words in this Document. N)rH   r   r;   r   r   r   r   
iter_words     
zDocument.iter_wordsc                 c   r   )z> An iterator that returns all of the tokens in this Document. N)rH   rN   r   r   r   r   iter_tokens  r   zDocument.iter_tokensc                 C      dd | j D S )z6 Returns a list of list of comments for the sentences c                 S   s   g | ]
}d d |j D qS )c                 S   s   g | ]}|qS r   r   r   r   r   r   rr     rw   z9Document.sentence_comments.<locals>.<listcomp>.<listcomp>)rI   r   r   r   r   rr     r   z.Document.sentence_comments.<locals>.<listcomp>rH   r\   r   r   r   r     s   zDocument.sentence_commentsc                 C   rZ   )z8
        Access the coref lists of the document
        )rE   r\   r   r   r   coref     zDocument.corefc                 C   s   || _ | | dS )z  Set the document's coref lists N)rE   _attach_coref_mentions)r;   chainsr   r   r   r     s   c                 C   s   | j D ]}|jD ]}g |_qq|D ]=}t|jD ]5\}}| j |j }t|j|jD ]#}||jk}||jd k}	||j	k}
t
|||	|
}|j| j| q)qqd S Nr   )rH   r   r+   r   mentionsrR   range
start_wordend_wordrepresentative_indexr   r   )r;   r   rR   r   chainmention_idxmentionword_idxis_startis_endis_representative
attachmentr   r   r   r     s    



zDocument._attach_coref_mentionsc                 C   s4   t t||t| j | jD ]	\}}t||_qd S r4   )rM   r   rP   rH   r   r   )r;   start_indexr   rR   r   r   r   reindex_sentences  s   $zDocument.reindex_sentencesc                 C   r   )zp Dumps the whole document into a list of list of dictionary for each token in each sentence in the doc.
        c                 S      g | ]}|  qS r   )to_dictr   r   r   r   rr     rs   z$Document.to_dict.<locals>.<listcomp>r   r\   r   r   r   r    s   zDocument.to_dictc                 C      t j|  ddtdS Nr   F)indentensure_asciiclsr8   dumpsr  r3   r\   r   r   r   __repr__     zDocument.__repr__c                 C   sD   |dkrd dd | jD S |dkrd dd | jD S t| S )Ncz

c                 s       | ]}d  |V  qdS )z{:c}Nr   rp   r   r   r   r   r         z&Document.__format__.<locals>.<genexpr>Cc                 s   r  )z{:C}Nr  r  r   r   r   r     r  )r   rH   r   r;   specr   r   r   
__format__  s
   zDocument.__format__c                 C   s   t | j|  |  fS )z Dumps the whole document including text to a byte array containing a list of list of dictionaries for each token in each sentence in the doc.
        )pickler
  r   r  r   r\   r   r   r   to_serialized  s   zDocument.to_serializedc                 C   sd   t |}t|tstdt|dkr"t |\}}| ||}|S t |\}}}| |||}|S )zu Create and initialize a new document from a serialized string generated by Document.to_serialized_string():
        z8Serialized data was not a tuple when building a Documentr   )r  loadsr5   r   	TypeErrorrP   )r  serialized_stringstuffr   rH   rn   rI   r   r   r   from_serialized  s   


zDocument.from_serialized)NNNr   )FF)FN)F)#r   r   r   __doc__rK   rG   propertyr]   setterr   rH   rf   rh   rj   rm   rC   r   r   r   r   r   rF   r   r   r   r   r   r   r  r  r  r  r  classmethodr  r   r   r   r   r=   >   sr    















3

(0

]


r=   c                   @   s  e Zd ZdZdQddZdd Zdd Zed	d
 Zej	dd
 Zedd Z
e
j	dd Z
edd Zej	dd Zedd Zej	dd Zedd Zej	dd Zedd Zej	dd Zedd Zej	dd Zedd Zej	d d Zed!d" Zej	d#d" Zed$d% Zej	d&d% Zed'd( Zej	d)d( Zed*d+ Zej	d,d+ Zd-d. Zed/d0 Zej	d1d0 Zed2d3 Zej	d4d3 Zed5d6 Zd7d8 Zd9d: Zd;d< Zd=d> ZdRd?d@ZdAdB ZdRdCdDZdEdF Z dRdGdHZ!dIdJ Z"dKdL Z#dMdN Z$dOdP Z%dS )Sr   zY A sentence class that stores attributes of a sentence and carries a list of tokens.
    Nc                    sv   g  _ g  _g  _d _g  _| _d _d _g  _d _	d _
 | |dur6 fdd|D  _dS g  _dS )zS Construct a sentence given a list of tokens in the form of CoNLL-U dicts.
        Nc                    r   r   )r   )rp   entryr\   r   r   rr   4  r   z%Sentence.__init__.<locals>.<listcomp>)_tokens_words_dependenciesr@   rD   _doc_constituency
_sentiment	_comments_doc_id_enhanced_dependencies_process_tokens_empty_words)r;   rN   rn   ro   r   r\   r   rK     s   

zSentence.__init__c           	      C   sL  d\}}g g | _ | _t|D ]\}}t|vr|d f|t< t|t tr,|t f|t< t|tdkrE|t \}}| j t	| | qt
| |}t| jdkrh| jd j|jkrh|| jd< || j d jd< q| j| |td }||kr| j d j| n| j t	| ||gd | j d |_q| j D ]}|  q|   d S )N)rL   rL   r   r   rL   r   )rN   r   r   r   r5   intrP   r   r   Tokenr   r   r   consolidate_whitespacer   )	r;   rN   stenr   r!  new_wordidxr   r   r   r   r+  8  s0   
 


zSentence._process_tokensc                 C   s   | j duot| j dkS )zT
        Whether or not the enhanced dependencies are part of this sentence
        Nr   )r*  rP   r\   r   r   r   has_enhanced_dependencies]  s   z"Sentence.has_enhanced_dependenciesc                 C   rZ   )
        Access the index of this sentence within the doc.

        If multiple docs were processed together,
        the sentence index will continue counting across docs.
        _indexr\   r   r   r   r   c  s   zSentence.indexc                 C   r_   )! Set the sentence's index value. Nr7  r`   r   r   r   r   m  rb   c                 C   s   t jddd | jS )r6  DUse of sentence.id is deprecated.  Please use sentence.index insteadr   
stacklevelwarningswarnr8  r\   r   r   r   r   r  s   zSentence.idc                 C   s   t jddd || _dS )r9  r:  r   r;  Nr=  r`   r   r   r   r   }  s   
c                 C   rZ   )z8 conll-style sent_id  Will be set from index if unknown )_sent_idr\   r   r   r   r     r^   zSentence.sent_idc                 C   P   || _ dt| }t| jD ]\}}|dr|| j|<  dS q| j| dS )z# Set the sentence's sent_id value. z# sent_id = N)r@  r   r   r(  r   r   )r;   ra   sent_id_commentcomment_idxr   r   r   r   r        

c                 C   rZ   )z2 conll-style doc_id  Can be left blank if unknown )r)  r\   r   r   r   doc_id  r^   zSentence.doc_idc                 C   rA  )z" Set the sentence's doc_id value. z# doc_id = N)r)  r   r   r(  r   r   )r;   ra   doc_id_commentrC  r   r   r   r   rE    rD  c                 C   rZ   z% Access the parent doc of this span. r%  r\   r   r   r   rn     r^   zSentence.docc                 C   r_   z" Set the parent doc of this span. NrH  r`   r   r   r   rn     rb   c                 C   rZ   )z( Access the raw text for this sentence. rc   r\   r   r   r   r     r^   zSentence.textc                 C   r_   )z% Set the raw text for this sentence. Nrc   r`   r   r   r   r     rb   c                 C   rZ   )z0 Access list of dependencies for this sentence. r$  r\   r   r   r   dependencies  r^   zSentence.dependenciesc                 C   r_   )z1 Set the list of dependencies for this sentence. NrJ  r`   r   r   r   rK    rb   c                 C   rZ   )z. Access the list of tokens for this sentence. r"  r\   r   r   r   rN     r^   zSentence.tokensc                 C   r_   )z+ Set the list of tokens for this sentence. NrL  r`   r   r   r   rN     rb   c                 C   rZ   z- Access the list of words for this sentence. r#  r\   r   r   r   r     r^   zSentence.wordsc                 C   r_   z* Set the list of words for this sentence. NrN  r`   r   r   r   r     rb   c                 C   rZ   rM  r,  r\   r   r   r   ro     r^   zSentence.empty_wordsc                 C   r_   rO  rP  r`   r   r   r   ro     rb   c                 C   rZ   )z/ Access the list of entities in this sentence. ri   r\   r   r   r   rj     r^   zSentence.entsc                 C   r_   z, Set the list of entities in this sentence. Nri   r`   r   r   r   rj     rb   c                 C   rZ   rl   ri   r\   r   r   r   rm     r^   zSentence.entitiesc                 C   r_   rQ  ri   r`   r   r   r   rm     rb   c              	   C   sf   g | _ dd | jD }t|}|D ]}| j|d |d d  }| j t||d | j| d q| j S )a   Build the list of entities by iterating over all tokens. Return all entities as a list.

        Note that unlike other attributes, since NER requires raw text, the actual tagging are always
        performed at and attached to the `Token`s, instead of `Word`s.
        c                 S   r   r   )r$   )rp   wr   r   r   rr     r   z'Sentence.build_ents.<locals>.<listcomp>startendr   r(   )rN   r(   rn   r   )rj   rN   r	   r   Spanrn   )r;   tagsdecodedr   
ent_tokensr   r   r   rF     s    zSentence.build_entsc                 C   rZ   )z/ Returns the sentiment value for this sentence )r'  r\   r   r   r   r)     r^   zSentence.sentimentc                 C   rA  )z Set the sentiment value z# sentiment = N)r'  r   r   r(  r   r   )r;   ra   sentiment_commentrC  r   r   r   r   r)     rD  c                 C   rZ   )z1 Returns the constituency tree for this sentence )r&  r\   r   r   r   r*     r^   zSentence.constituencyc                 C   sd   || _ dt| }|dddd}t| jD ]\}}|dr)|| j|<  dS q| j| dS )z
        Set the constituency tree

        This incidentally updates the #constituency comment if it already exists,
        or otherwise creates a new comment # constituency = ...
        z# constituency = 
z*NL*r   N)r&  r   replacer   r(  r   r   )r;   ra   constituency_commentrC  r   r   r   r   r*     s   

c                 C   rZ   )z0 Returns CoNLL-style comments for this sentence )r(  r\   r   r   r   rI   ,  r^   zSentence.commentsc                 C   s2  | ds	d| }| dr6|dd\}}t|}t|dkr'td| |d | _dd	 | jD | _n[| d
rV|dd\}}t|	 }|| _
dd	 | jD | _n;| drt|dd\}}|	 }|| _dd	 | jD | _n| dr|dd\}}|	 }|| _dd	 | jD | _| j| dS )z Adds a single comment to this sentence.

        If the comment does not already have # at the start, it will be added.
        #z# # constituency =r   r   z0Multiple constituency trees for one sentence: %sr   c                 S      g | ]	}| d s|qS )r_  r~   ru   r   r   r   rr   >      z(Sentence.add_comment.<locals>.<listcomp># sentiment =c                 S   r`  )rb  r~   ru   r   r   r   rr   C  ra  # sent_id =c                 S   r`  )rc  r~   ru   r   r   r   rr   H  ra  
# doc_id =c                 S   r`  )rd  r~   ru   r   r   r   rr   M  ra  N)r   r   r
   
read_treesrP   r   r&  r(  r.  r   r'  r@  r)  r   )r;   r   _	tree_texttreer)   r   rE  r   r   r   r   1  s2   






zSentence.add_commentc                 C   sZ   t dd | jD }t| jt| jkot| j| jd jk}|r)|r+|   d S d S d S )Nc                 s   s$    | ]}|j d uo|jd uV  qd S r4   )r    r!   r   r   r   r   r   R     " z0Sentence.rebuild_dependencies.<locals>.<genexpr>rL   )r   r   rP   rN   r   build_dependencies)r;   is_complete_dependenciesis_complete_wordsr   r   r   r   P  s   *zSentence.rebuild_dependenciesc                 C   s   g | _ | jD ]I}|jdkrtdtdi}t| |}n,z
| j|jd  }W n ty: } ztd|j|j|d}~ww |j|jkrEt	d| j 
||j|f qdS )z~ Build the dependency graph for this sentence. Each dependency graph entry is
        a list of (head, deprel, word).
        r   ROOTr   z2Word head {} is not a valid word index for word {}Nz*Dependency tree is incorrectly constructed)rK  r   r    r   r   r   r   r   r   r   r   r!   )r;   r   
word_entryr    r   r   r   r   rj  V  s   

zSentence.build_dependenciesc                 C   s\   g | _ t| jD ]#\}}||_|dkrdnd|_d|j|jf |_| j ||j|f qd S )Nr   rootdep%d:%s)rK  r   r   r    r!   r"   r   )r;   r   r   r   r   r   r   j  s   z Sentence.build_fake_dependenciesc                 C   s2   | j D ]}t|d j|d j|d f|d qdS )z+ Print the dependencies for this sentence. r   r   r   fileN)rK  printr   r   )r;   rs  dep_edger   r   r   print_dependenciesr  s   
$zSentence.print_dependenciesc                 C       t  }| j|d |  S )z6 Dump the dependencies for this sentence into string. rr  )ioStringIOrv  getvaluer   )r;   
dep_stringr   r   r   dependencies_stringw     zSentence.dependencies_stringc                 C       | j D ]
}t| |d qdS )z% Print the tokens for this sentence. rr  N)rN   rt  pretty_print)r;   rs  tokr   r   r   print_tokens}     
zSentence.print_tokensc                 C   rw  )z0 Dump the tokens for this sentence into string. rr  )rx  ry  r  rz  r   )r;   toks_stringr   r   r   tokens_string  r}  zSentence.tokens_stringc                 C   r~  )z$ Print the words for this sentence. rr  N)r   rt  r  )r;   rs  r   r   r   r   print_words  r  zSentence.print_wordsc                 C   rw  )z/ Dump the words for this sentence into string. rr  )rx  ry  r  rz  r   )r;   wrds_stringr   r   r   words_string  r}  zSentence.words_stringc                 C   s   g }d}t | jD ]@\}}|t| jk rC| j| jd |jd k rC|| j|   |d7 }|t| jk rC| j| jd |jd k s!|| 7 }q	| j|d D ]	}||  qQ|S )zV Dumps the sentence into a list of dictionary for each token in the sentence.
        r   r   N)r   rN   rP   r,  r   r   r  )r;   ret	empty_idx	token_idxr   
empty_wordr   r   r   r    s   ((zSentence.to_dictc                 C   r  r  r	  r\   r   r   r   r    r  zSentence.__repr__c           	      C   s"  |dkr|dkrt | S g }d}t| jD ]A\}}|t| jk rO| j| jd |jd k rO|| j|   |d7 }|t| jk rO| j| jd |jd k s-||  q| j|d  D ]	}||  q^|dkrqd|S |dkrd|}t| j	dkrd| j	}|d | S |S d S )Nr  r  r   r   rZ  )
r   r   rN   rP   r,  r   r   to_conll_textr   rI   )	r;   r  r   r  r  r   r  rN   r   r   r   r   r    s*   ((

zSentence.__format__r   r4   )&r   r   r   r  rK   r+  r5  r  r   r  r   r   rE  rn   r   rK  rN   r   ro   rj   rm   rF   r)   r*   rI   r   r   rj  r   rv  r|  r  r  r  r  r  r  r  r   r   r   r   r     s    
%
	































r   c                 C   s   g }| j dD ]7}|dd}t|dkr:|\}}|ttfv r$t|}d| }t| |r5t| || q|tkr:q|	| qd
|| _ dS )zCreate attributes by parsing from the `misc` field.

    Also, remove start_char, end_char, and any other values we can set
    from the misc field if applicable, so that we don't repeat ourselves
    r   r   r   r   rf  N)_miscr   rP   
START_CHAREND_CHARr.  hasattrr   NERr   r   )r   remaining_valuesitem	key_valuekeyra   attrr   r   r   init_from_misc  s   

r  -c           
   	   C   s  dd t tD }g }| D ]}|tks|tkr#|d|| |  q|tkr3|d|| |  q|tkr| | }t|dkr~g }|D ],}|j	rP|j
rPd}n|j	rVd}n|j
r\d}nd}|jrcd	nd
}	|d||	|jjf  qE|d|d| q|tkr| | r|| |  q|tkrt| | tr|dd | | D nt| | |t| < q|tv rt| | |t| < q|rd||tt < nd|tt < d|tt  vrd|tt  vrt| vrttt| t tr| t n| t d d |tt < d|S )Nc                 S   s   g | ]}d qS )rf  r   )rp   r   r   r   r   rr     rw   z&dict_to_conll_text.<locals>.<listcomp>{}={}r   zunit-zstart-zend-zmiddle-zrepr-r   z%s%sid%d,c                 S   rx   r   r   ru   r   r   r   rr     rs   r   rf  r  .r   	)r   	FIELD_NUMr  r  r   r   r  COREF_CHAINSrP   r   r   r   r   r   r   MISCr   r5   r   r   FIELD_TO_IDXHEADr.  )

token_dictid_connectortoken_conllr#   r  r   misc_chainsr   coref_positionr   r   r   r   dict_to_conll_text  sL   <.6
r  c                   @   s  e Zd ZdZd7ddZedd Zejdd Zedd	 Zejd
d	 Zedd Z	e	jdd Z	edd Z
e
jdd Z
dd Zedd Zejdd Zedd Zejdd Zedd Zejdd Zedd Zedd Zed d! Zejd"d! Zed#d$ Zejd%d$ Zed&d' Zejd(d' Zd)d* Zd+d, Zd-d. Zeeeeeeeegfd/d0Zd1d2 Z d3d4 Z!d5d6 Z"dS )8r/  a^   A token class that stores attributes of a token and carries a list of words. A token corresponds to a unit in the raw
    text. In some languages such as English, a token has a one-to-one mapping to a word, while in other languages such as French,
    a (multi-word) token might be expanded into multiple words that carry syntactic annotations.
    Nc                 C   s   | t| _| t| _| jstd| jstd| td| _| td| _	| t
d| _|dur5|ng | _| td| _| td| _|| _| td| _d| _d| _| jdurat|  dS dS )z
        Construct a token given a dictionary format token entry. Optionally link itself to the corresponding words.
        The owning sentence must be passed in.
        zid not included for the tokenztext not included for the tokenNr   r   )r   r   _idr   r@   r   r  r  r  _ner	MULTI_NER
_multi_nerr#  r  _start_charr  	_end_char_sentMEXP_mexp_spaces_before_spaces_afterr  )r;   rR   token_entryr   r   r   r   rK   	  s&   
zToken.__init__c                 C   rZ   )z! Access the index of this token. r  r\   r   r   r   r   "  r^   zToken.idc                 C   r_   )z Set the token's id value. Nr  r`   r   r   r   r   '  rb   c                 C   rZ   z6 Access the whether this token was manually expanded. r  r\   r   r   r   r   ,  r^   zToken.manual_expansionc                 C   r_   z3 Set the whether this token was manually expanded. Nr  r`   r   r   r   r   1  rb   c                 C   rZ   )z/ Access the text of this token. Example: 'The' rc   r\   r   r   r   r   6  r^   z
Token.textc                 C   r_   )z, Set the token's text value. Example: 'The' Nrc   r`   r   r   r   r   ;  rb   c                 C   rZ   )z- Access the miscellaneousness of this token. r  r\   r   r   r   r#   @  r^   z
Token.miscc                 C   "   |  |dkr|| _dS d| _dS )z* Set the token's miscellaneousness value. FN_is_nullr  r`   r   r   r   r#   E     "c                 C   s  d}d}t | j}t| jD ]r\}}|j}|sq|d}|dkr2tdd |D r1t|| _d}ntdd |D r@t	d ||d	 krWtd
d |D rVt
|| _d}ntdd |D rst
|}|dkrnt	d nt	d dd |D }d||_q| j}|r|d}tdd |D rt|}	|r|	| jkrt	d n|	| _tdd |D rt
|}
|r|
| jkrt	d n|
| _dd |D }d|| _dS dS )zi
        Remove whitespace misc annotations from the Words and mark the whitespace on the Tokens
        Fr   r   c                 s       | ]}| d V  qdS SpacesBefore=Nr~   rp   r   r   r   r   r   W  r  z/Token.consolidate_whitespace.<locals>.<genexpr>Tc                 s   r  r  r~   r  r   r   r   r   [  r  zUFound a SpacesBefore MISC annotation on a Word that was not the first Word in a Tokenr   c                 s   $    | ]}| d p| dV  qdS SpaceAfter=SpacesAfter=Nr~   r  r   r   r   r   ^  ri  c                 s   r  r  r~   r  r   r   r   r   b  ri  r   zEUnexpected SpaceAfter=No annotation on a word in the middle of an MWTz8Unexpected SpacesAfter on a word in the middle on an MWTc                 S   .   g | ]}| d s| ds| ds|qS r  r  r  r~   ru   r   r   r   rr   h     . z0Token.consolidate_whitespace.<locals>.<listcomp>c                 s   r  r  r~   r  r   r   r   r   n  r  z7Found conflicting SpacesBefore on a token and its word!c                 s   r  r  r~   r  r   r   r   r   u  ri  zCFound conflicting SpaceAfter / SpacesAfter on a token and its word!c                 S   r  r  r~   ru   r   r   r   rr   |  r  N)rP   r   r   r#   r   r   r   rQ   r>  r?  r   rO   r   )r;   found_afterfound_beforerh   r   r   r#   r   unexpected_space_afterrQ   rO   r   r   r   r0  J  s^   










zToken.consolidate_whitespacec                 C   rZ   )z= SpacesBefore for the token. Translated from the MISC fields r  r\   r   r   r   rQ     r^   zToken.spaces_beforec                 C   
   || _ d S r4   r  r`   r   r   r   rQ        
c                 C   rZ   )zJ SpaceAfter or SpacesAfter for the token.  Translated from the MISC field r  r\   r   r   r   rO     r^   zToken.spaces_afterc                 C   r  r4   r  r`   r   r   r   rO     r  c                 C   rZ   )z; Access the list of syntactic words underlying this token. rN  r\   r   r   r   r     r^   zToken.wordsc                 C   s   || _ | j D ]}| |_qdS )z6 Set this token's list of underlying syntactic words. N)r#  r   )r;   ra   rR  r   r   r   r     s   
c                 C   rZ   zB Access the start character index for this token in the raw text. r  r\   r   r   r   r&     r^   zToken.start_charc                 C   rZ   z@ Access the end character index for this token in the raw text. r  r\   r   r   r   r'     r^   zToken.end_charc                 C   rZ   )z3 Access the NER tag of this token. Example: 'B-ORG')r  r\   r   r   r   r$     r^   z	Token.nerc                 C   r  )z* Set the token's NER tag. Example: 'B-ORG'FN)r  r  r`   r   r   r   r$     r  c                 C   rZ   )zF Access the MULTI_NER tag of this token. Example: '(B-ORG, B-DISEASE)')r  r\   r   r   r   r%     r^   zToken.multi_nerc                 C   r  )z= Set the token's MULTI_NER tag. Example: '(B-ORG, B-DISEASE)'FN)r  r  r`   r   r   r   r%     r  c                 C   rZ   )z@ Access the pointer to the sentence that this token belongs to. r  r\   r   r   r   r     r^   z
Token.sentc                 C   r_   )z= Set the pointer to the sentence that this token belongs to. Nr  r`   r   r   r   r     rb   c                 C   r  r  r	  r\   r   r   r   r    r  zToken.__repr__c                 C   s.   |dkrd |  S |dkr|  S t| S )Nr  rZ  P)r   r  r  r   r  r   r   r   r    s
   zToken.__format__c                 C   s   d dd |  D S )NrZ  c                 s   s    | ]}t |V  qd S r4   )r  ru   r   r   r   r     r   z&Token.to_conll_text.<locals>.<genexpr>)r   r  r\   r   r   r   r    s   zToken.to_conll_textc           
      C   s  g }t | jdkrmi }|D ]}t| |durt| |||< qt|v rh| j}|durE|dkrEt|}|trA|t d | |t< n||t< | j}|durh|dkrht|}|trd|t d | |t< n||t< |	| | j
D ]}| }	t | jdkrt|v rt| tdurt| t|	t< t | jdkrt|v rt| tdurt| t|	t< t | jdkrt|v r| j}|dur|dkrt|}|	tr|	t d | |	t< n||	t< | j}|dur|dkrt|}|	tr|	t d | |	t< n||	t< |	|	 qp|S )z Dumps the token into a list of dictionary for this token with its extended words
        if the token is a multi-word token.
        r   Nr   r   r   )rP   r   r   r  rO   r   r   rQ   r   r   r   r  r  r  )
r;   r   r  r  r   rO   
space_miscrQ   r   	word_dictr   r   r   r    sT   



$$

zToken.to_dictc              	   C   s@   d| j j dddd | jD  dddd | jD  d	S )
z7 Print this token with its extended words in one line. <z id=r  c                 S   rx   r   r  ru   r   r   r   rr   
  rs   z&Token.pretty_print.<locals>.<listcomp>z;words=[rt   c                 S   r  r   )r  r   r   r   r   rr   
  rs   z]>)	__class__r   r   r   r   r\   r   r   r   r    s   @zToken.pretty_printc                 C      |d u p|dkS Nrf  r   r`   r   r   r   r       zToken._is_nullc                 C   s   t | jdkS r   r   r\   r   r   r   r     s   zToken.is_mwtr4   )#r   r   r   r  rK   r  r   r  r   r   r#   r0  rQ   rO   r   r&   r'   r$   r%   r   r  r  r  r   r   r  r  r  r  r  r  r  r  r  r   r   r   r   r   r/    sn    








5













5r/  c                   @   s"  e Zd ZdZdd Zedd Zejdd Zedd Zejd	d Zed
d Z	e	jdd Z	edd Z
e
jdd Z
edd Zejdd Zedd Zejdd Zedd Zejdd Zedd Zejdd Zedd Zejdd Zedd  Zejd!d  Zed"d# Zejd$d# Zed%d& Zejd'd& Zed(d) Zejd*d) Zed+d, Zejd-d, Zed.d/ Zejd0d/ Zed1d2 Zejd3d2 Zed4d5 Zejd6d5 Zd7d8 Zd9d: Zd;d< Zeeeeee e!e"e#e$e%e&e'e(gfd=d>Z)d?d@ Z*dAdB Z+dCS )Dr   z4 A word class that stores attributes of a word.
    c                 C   s&  | td| _t| jtrt| jdkr| jd | _| td| _| jdur+| jdus2J d|| t	d| _
| td| _| td| _| td| _| td| _| td| _| td| _| td| _| td| _d| _|| _| td| _d| _| jdurt |  | t!d| _"dS )z@ Construct a word given a dictionary format word entry.
        Nr   r   z/id and text should be included for the word. {})#r   r   r  r5   r   rP   r   r@   r   LEMMA_lemmaUPOS_uposXPOS_xposFEATS_featsr  _headDEPREL_deprelr  r  r  r  r  r  _parentr  r  r  _coref_chainsr  DEPSr"   )r;   rR   rn  r   r   r   rK     s,   "
zWord.__init__c                 C   rZ   r  r  r\   r   r   r   r   6  r^   zWord.manual_expansionc                 C   r_   r  r  r`   r   r   r   r   ;  rb   c                 C   rZ   )z  Access the index of this word. r  r\   r   r   r   r   @  r^   zWord.idc                 C   r_   )z Set the word's index value. Nr  r`   r   r   r   r   E  rb   c                 C   rZ   )z- Access the text of this word. Example: 'The'rc   r\   r   r   r   r   J  r^   z	Word.textc                 C   r_   )z* Set the word's text value. Example: 'The'Nrc   r`   r   r   r   r   O  rb   c                 C   rZ   )z  Access the lemma of this word. )r  r\   r   r   r   r   T  r^   z
Word.lemmac                 C   s,   |  |dks| jdkr|| _dS d| _dS )z Set the word's lemma value. Frf  N)r  r@   r  r`   r   r   r   r   Y  s   ,c                 C   rZ   zB Access the universal part-of-speech of this word. Example: 'NOUN'r  r\   r   r   r   r   ^  r^   z	Word.uposc                 C   r  z? Set the word's universal part-of-speech value. Example: 'NOUN'FNr  r  r`   r   r   r   r   c  r  c                 C   rZ   )zI Access the treebank-specific part-of-speech of this word. Example: 'NNP')r  r\   r   r   r   r   h  r^   z	Word.xposc                 C   r  )zF Set the word's treebank-specific part-of-speech value. Example: 'NNP'FN)r  r  r`   r   r   r   r   m  r  c                 C   rZ   )zF Access the morphological features of this word. Example: 'Gender=Fem')r  r\   r   r   r   r   r  r^   z
Word.featsc                 C   r  )z> Set this word's morphological features. Example: 'Gender=Fem'FN)r  r  r`   r   r   r   r   w  r  c                 C   rZ   )z- Access the id of the governor of this word. )r  r\   r   r   r   r    |  r^   z	Word.headc                 C   s&   |  |dkrt|| _dS d| _dS )z# Set the word's governor id value. FN)r  r.  r  r`   r   r   r   r      s   &c                 C   rZ   )z= Access the dependency relation of this word. Example: 'nmod')r  r\   r   r   r   r!     r^   zWord.deprelc                 C   r  )z: Set the word's dependency relation value. Example: 'nmod'FN)r  r  r`   r   r   r   r!     r  c                 C   s   | j j}|du s|| jsdS g }tt|| jdd d}|D ]/}tt||| j}|D ]}t|t	rB|
d||f  q1|
d|d |d |f  q1q"|sVdS d	|S )
z' Access the dependencies of this word. Nc                 S   s   t | tr| S | fS r4   )r5   r   )rv   r   r   r   <lambda>  rs   zWord.deps.<locals>.<lambda>)r  rq  z%d.%d:%sr   r   r   )r  r*  has_noder   r   ry   predecessorsget_edge_datar5   r.  r   r   )r;   graphdatar  r   r"   rp  r   r   r   r"     s   

z	Word.depsc                 C   s   | j j}|du r|du rdS |du rt }|| j _|| jr-t|| j}|| |du r3dS t	|t
r=|d}tdd |D rMdd |D }|D ]"\}}ttt|jddd	}t|dkri|d
 }||| j| qOdS )z$ Set the word's dependencies value. Nr   c                 s   s    | ]}t |tV  qd S r4   )r5   r   ru   r   r   r   r     r  zWord.deps.<locals>.<genexpr>c                 S   s   g | ]	}|j d ddqS ):r   maxsplit)r   ru   r   r   r   rr     ra  zWord.deps.<locals>.<listcomp>r  r   r  r   )r  r*  nxMultiDiGraphr  r   ry   in_edgesremove_edges_fromr5   r   r   r   r   mapr.  rP   add_edge)r;   ra   r  r  r   rp  r   r   r   r"     s*   


c                 C   rZ   )z, Access the miscellaneousness of this word. r  r\   r   r   r   r#     r^   z	Word.miscc                 C   r  )z) Set the word's miscellaneousness value. FNr  r`   r   r   r   r#     r  c                 C   rZ   r  r  r\   r   r   r   r&     r^   zWord.start_charc                 C   r  r4   r  r`   r   r   r   r&     r  c                 C   rZ   r  r  r\   r   r   r   r'     r^   zWord.end_charc                 C   r  r4   r  r`   r   r   r   r'     r  c                 C   rZ   )z Access the parent token of this word. In the case of a multi-word token, a token can be the parent of
        multiple words. Note that this should return a reference to the parent token object.
        r  r\   r   r   r   r     r   zWord.parentc                 C   r_   )z Set this word's parent token. In the case of a multi-word token, a token can be the parent of
        multiple words. Note that value here should be a reference to the parent token object.
        Nr  r`   r   r   r   r     s   
c                 C   rZ   r  r  r\   r   r   r   pos  r^   zWord.posc                 C   r  r  r  r`   r   r   r   r    r  c                 C   rZ   )a  
        coref_chains points to a list of CorefChain namedtuple, which has a list of mentions and a representative mention.

        Useful for disambiguating words such as "him" (in languages where coref is available)

        Theoretically it is possible for multiple corefs to occur at the same word.  For example,
          "Chris Manning's NLP Group"
        could have "Chris Manning" and "Chris Manning's NLP Group" as overlapping entities
        r  r\   r   r   r   r+     s   zWord.coref_chainsc                 C   r_   )z& Set the backref for the coref chains Nr   )r;   r   r   r   r   r+     rb   c                 C   rZ   )z? Access the pointer to the sentence that this word belongs to. r  r\   r   r   r   r     r^   z	Word.sentc                 C   r_   )z< Set the pointer to the sentence that this word belongs to. Nr  r`   r   r   r   r     rb   c                 C   r  r  r	  r\   r   r   r   r    r  zWord.__repr__c                 C   s(   |dkr|   S |dkr|  S t| S )Nr  r  )r  r  r   r  r   r   r   r    s
   zWord.__format__c                 C   s   |   }t|dS )zS
        Turn a word into a conll representation (10 column tab separated)
        r  )r  r  )r;   r  r   r   r   r  !  s   
zWord.to_conll_textc                 C   s.   i }|D ]}t | |durt | |||< q|S )z+ Dumps the word into a dictionary.
        Nr   )r;   r   r  r   r   r   r   r  (  s   zWord.to_dictc                    sB   t tttttttg}d fdd|D }d j	j
 d| dS )z Print the word in one line. ;c                    s,   g | ]}t  |d urd|t  |qS )Nr  )r   r   )rp   kr\   r   r   rr   4  s   , z%Word.pretty_print.<locals>.<listcomp>r  r   >)r   r   r  r  r  r  r  r  r   r  r   )r;   featuresfeature_strr   r\   r   r  1  s   zWord.pretty_printc                 C   r  r  r   r`   r   r   r   r  7  r  zWord._is_nullN),r   r   r   r  rK   r  r   r  r   r   r   r   r   r   r    r!   r"   r#   r&   r'   r   r  r+   r   r  r  r  r   r   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r   r   r   r     s     



















 













(	r   c                   @   s  e Zd ZdZd'ddZdd Zdd Zed	d
 Zej	dd
 Zedd Z
e
j	dd Z
edd Zej	dd Zedd Zej	dd Zedd Zej	dd Zedd Zej	dd Zedd Zej	dd Zedd Zej	d d Zd!d" Zd#d$ Zd%d& ZdS )(rU  z A span class that stores attributes of a textual span. A span can be typed.
    A range of objects (e.g., entity mentions) can be represented as spans.
    Nc                 C   s   |dus|dur|dusJ d|dusJ ddgd \| _ | _| _| _g | _g | _|| _|| _|dur:| | |durF| 	|| dS dS )z Construct a span given a span entry or a list of tokens. A valid reference to a doc
        must be provided to construct a span (otherwise the text of the span cannot be initialized).
        NzMEither a span_entry or a token list needs to be provided to construct a span.z2A parent doc must be provided to construct a span.r-   )
r@   _typer  r  r"  r#  r%  r  init_from_entryinit_from_tokens)r;   
span_entryrN   r(   rn   r   r   r   r   rK   @  s   
zSpan.__init__c                 C   s<   | td | _| td | _| td | _| td | _d S r4   )	r   r   r   TYPEr(   r  r&   r  r'   )r;   r	  r   r   r   r  S  s   zSpan.init_from_entryc                 C   s   t |ts	J dt|dksJ d|| _|| _| jd j| _| jd j| _| jd ur>| jjd ur>| jj| j| j | _n2|d j	|d j	u rl|d j	}|d j|jd j }|d j|jd j }|j|| | _nt
ddd |D | _|d j	| _	d S )Nz6Tokens must be provided as a list to construct a span.r   z)Tokens of a span cannot be an empty list.rL   zxDocument text does not exist, and the span tested crosses two sentences, so it is impossible to extract the entity text!c                 S   s   g | ]
}|j D ]}|qqS r   r-  )rp   rq   rR  r   r   r   rr   l  r   z)Span.init_from_tokens.<locals>.<listcomp>)r5   ry   rP   rN   r(   r&   r'   rn   r   r   RuntimeErrorr   )r;   rN   r(   rR   
text_starttext_endr   r   r   r  Y  s    
zSpan.init_from_tokensc                 C   rZ   rG  rH  r\   r   r   r   rn   p  r^   zSpan.docc                 C   r_   rI  rH  r`   r   r   r   rn   u  rb   c                 C   rZ   )z= Access the text of this span. Example: 'Stanford University'rc   r\   r   r   r   r   z  r^   z	Span.textc                 C   r_   )z: Set the span's text value. Example: 'Stanford University'Nrc   r`   r   r   r   r     rb   c                 C   rZ   )zD Access reference to a list of tokens that correspond to this span. rL  r\   r   r   r   rN     r^   zSpan.tokensc                 C   r_   )z  Set the span's list of tokens. NrL  r`   r   r   r   rN     rb   c                 C   rZ   )zC Access reference to a list of words that correspond to this span. rN  r\   r   r   r   r     r^   z
Span.wordsc                 C   r_   )z Set the span's list of words. NrN  r`   r   r   r   r     rb   c                 C   rZ   )z0 Access the type of this span. Example: 'PERSON'r  r\   r   r   r   r(     r^   z	Span.typec                 C   r_   )z Set the type of this span. Nr  r`   r   r   r   r(     rb   c                 C   rZ   )z1 Access the start character offset of this span. r  r\   r   r   r   r&     r^   zSpan.start_charc                 C   r_   )z. Set the start character offset of this span. Nr  r`   r   r   r   r&     rb   c                 C   rZ   )z/ Access the end character offset of this span. r  r\   r   r   r   r'     r^   zSpan.end_charc                 C   r_   )z, Set the end character offset of this span. Nr  r`   r   r   r   r'     rb   c                 C   rZ   )z? Access the pointer to the sentence that this span belongs to. r  r\   r   r   r   r     r^   z	Span.sentc                 C   r_   )z< Set the pointer to the sentence that this span belongs to. Nr  r`   r   r   r   r     rb   c                    s"   g d}t  fdd|D }|S )z# Dumps the span into a dictionary. )r   r(   r&   r'   c                    s   g | ]	}|t  |fqS r   r   )rp   	attr_namer\   r   r   rr     ra  z Span.to_dict.<locals>.<listcomp>)dict)r;   attrs	span_dictr   r\   r   r    s   zSpan.to_dictc                 C   r  r  r	  r\   r   r   r   r    r  zSpan.__repr__c                 C   s6   |   }ddd | D }d| jj d| dS )z Print the span in one line. r  c                 S   s   g | ]
\}}d  ||qS )r  r  )rp   r  vr   r   r   rr     r   z%Span.pretty_print.<locals>.<listcomp>r  r   r  )r  r   itemsr  r   )r;   r  r  r   r   r   r    s   zSpan.pretty_print)NNNNN)r   r   r   r  rK   r  r  r  rn   r  r   rN   r   r(   r&   r'   r   r  r  r  r   r   r   r   rU  ;  sP    
















rU  )r  )>r  rx  	itertoolsr   rer8   r  r>  enumr   networkxr  "stanza.models.common.stanza_objectr   stanza.models.common.utilsr   r   r   r   stanza.models.ner.utilsr	   stanza.models.constituencyr
   stanza.models.coref.coref_chainr   r   r   r   compilemulti_word_token_idr   r  r   r   r  r  r  r  r  r  r  r  r  r  r  r  r
  	SENTIMENTCONSTITUENCYr  r  rP   r  r9   r3   r=   r   r  r  r/  r   rU  r   r   r   r   <module>   sp    

,   ]   &
-    +