from collections import Counter
from copy import copy
import json
import numpy as np
import re
import logging
import os

from torch.utils.data import DataLoader as TorchDataLoader

import stanza.utils.default_paths as default_paths
from stanza.models.common.utils import ud_scores, harmonic_mean
from stanza.models.common.doc import Document
from stanza.utils.conll import CoNLL
from stanza.models.common.doc import *
from stanza.models.tokenization.data import SortedDataset

logger = logging.getLogger('stanza')

paths = default_paths.get_default_paths()


def create_dictionary(lexicon):
    """
    This function creates a new dictionary used to improve the tokenization model for multi-syllable-word languages
    such as vi, zh or th. It takes the lexicon as input and outputs a dictionary that contains three sets:
    words, prefixes and suffixes, where the prefixes set contains all prefixes of the words in the lexicon, and similarly for suffixes.
    The point of having the prefixes/suffixes sets in the dictionary is to make lookups easier during data preparation.

    :param shorthand - language and dataset, eg: vi_vlsp, zh_gsdsimp
    :param lexicon - set of words used to create dictionary
    :return a dictionary object that contains words and their prefixes and suffixes.
    )wordsprefixessuffixesc                    s   |  d vr= d  |  d}d}tdt| d D ]$}|| |  }| t| | d  | } d  |  d  | qd S d S )Nr    r      r   r   )addrangelen)wordprefixsuffixi
dictionary [/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/models/tokenization/utils.pyadd_word#   s   z#create_dictionary.<locals>.add_wordr   )setr   )lexiconr   r   r   r   r   create_dictionary   s   r   c                    s  t  }g }tdfdd}|durhtj|s!td| tj|d}|j	D ]0}dd |j
D d	d |jD  }|D ]}	|	 }	|| |	rY|	|vrY||	 |t|	 q>q*t|}
td
|
 d |durtj|sytd| t|ddd}| }W d   n1 sw   Y  |D ]!}| }	|	dd}	|| |	r|	|vr||	 |t|	 qtdt||
  d tt|d  fdd|D }tdt| d | fS )a'  
    This function creates a lexicon that stores all the words from the training set and the external dictionary.
    This lexicon will be saved with the model and will be used to create the dictionary when the model is loaded.
    The idea of separating the lexicon and the dictionary into two different phases is a good tradeoff between time and space.
    Note that we eliminate long but infrequent words from the lexicon by keeping only words whose length falls within
    the 95th percentile of observed word lengths.

    :param shorthand - language and dataset, eg: vi_vlsp, zh_gsdsimp
    :param train_path - path to conllu train file
    :param external_path - path to external dict, expected to be inside the training dataset dir with format of: SHORTHAND-externaldict.txt
    :return a set lexicon object that contains all distinct words
    z(?:[^\d\W]+)|\sc                    s   |  dr"t|ddkr tttj|r tttj|s dS dS |  drAt|dkr?tt j|r?tttj|s?dS dS t|dkrYtttj|rYtttj|sYdS dS )z
        This function checks that the word is a multi-syllable word and not a number.
        For vi, whitespace is the syllable separator.
        """
        if shorthand.startswith("vi_"):
            if len(word.split(" ")) > 1 and any(map(str.isalpha, word)) and not any(map(str.isdigit, word)):
                return True
        elif shorthand.startswith("th_"):
            if len(word) > 1 and all(map(pattern_thai.match, word)) and not any(map(str.isdigit, word)):
                return True
        elif len(word) > 1 and any(map(str.isalpha, word)) and not any(map(str.isdigit, word)):
            return True
        return False

    if train_path is not None:
        if not os.path.isfile(train_path):
            raise FileNotFoundError(f"Cannot open train set at {train_path}")
        train_doc = CoNLL.conll2doc(input_file=train_path)
        for train_sent in train_doc.sentences:
            train_words = [x.text for x in train_sent.tokens if x.is_mwt()] + [x.text for x in train_sent.words]
            for word in train_words:
                word = word.lower()
                if check_valid_word(shorthand, word) and word not in lexicon:
                    lexicon.add(word)
                    length_freq.append(len(word))
        count_word = len(lexicon)
        logger.info(f"Added {count_word} words from the training data to the lexicon.")

    if external_path is not None:
        if not os.path.isfile(external_path):
            raise FileNotFoundError(f"Cannot open external dictionary at {external_path}")
        with open(external_path, "r", encoding="utf-8") as external_file:
            lines = external_file.readlines()
        for line in lines:
            word = line.strip().lower()
            word = word.replace("_", " ")
            if check_valid_word(shorthand, word) and word not in lexicon:
                lexicon.add(word)
                length_freq.append(len(word))
        logger.info(f"Added another {len(lexicon) - count_word} words from the external dict to dictionary.")

    # drop the long tail of overly long words: keep only words whose length is
    # within the 95th percentile of the observed lengths
    num_dict_feat = int(np.percentile(length_freq, 95))
    lexicon = {word for word in lexicon if len(word) <= num_dict_feat}
    logger.info(f"Final lexicon consists of {len(lexicon)} words after getting rid of long words.")

    return lexicon, num_dict_feat


def load_lexicon(args):
    """
    This function creates a new dictionary and loads it for training.
    The external dictionary is expected to be inside the training dataset dir with format of: SHORTHAND-externaldict.txt
    For example, vi_vlsp-externaldict.txt
    """
    shorthand = args["shorthand"]
    tokenize_dir = paths["TOKENIZE_DATA_DIR"]
    train_path = f"{tokenize_dir}/{shorthand}.train.gold.conllu"
    external_dict_path = f"{tokenize_dir}/{shorthand}-externaldict.txt"
    if not os.path.exists(external_dict_path):
        logger.info("External dictionary not found! Checking training data...")
        external_dict_path = None
    if not os.path.exists(train_path):
        logger.info(f"Training dataset does not exist, thus cannot create dictionary {train_path}")
        train_path = None
    if train_path is None and external_dict_path is None:
        raise FileNotFoundError(f"Cannot find training set / external dictionary at {train_path} and {external_dict_path}")

    return create_lexicon(shorthand, train_path, external_dict_path)


def load_mwt_dict(filename):
    if filename is not None:
        with open(filename, 'r') as f:
            mwt_dict0 = json.load(f)

        mwt_dict = dict()
        for item in mwt_dict0:
            (key, expansion), count = item

            if key not in mwt_dict or mwt_dict[key][1] < count:
                mwt_dict[key] = (expansion, count)

        return mwt_dict
    else:
        return


def process_sentence(sentence, mwt_dict=None):
    sent = []
    i = 0
    for tok, p, position_info in sentence:
        expansion = None
        if (p == 3 or p == 4) and mwt_dict is not None:
            # if the MWT is in the dictionary, use its expansion
            if tok in mwt_dict:
                expansion = mwt_dict[tok][0]
            elif tok.lower() in mwt_dict:
                expansion = mwt_dict[tok.lower()][0]

        if expansion is not None:
            sent.append({ID: (i+1, i+len(expansion)), TEXT: tok})
            if position_info is not None:
                sent[-1][START_CHAR] = position_info[0]
                sent[-1][END_CHAR] = position_info[1]
            for etok in expansion:
                sent.append({ID: (i+1, ), TEXT: etok})
                i += 1
        else:
            if len(tok) <= 0:
                continue
            sent.append({ID: (i+1, ), TEXT: tok})
            if position_info is not None:
                sent[-1][START_CHAR] = position_info[0]
                sent[-1][END_CHAR] = position_info[1]
            if p == 3 or p == 4:
                # mark an MWT that was not expanded by the dictionary
                sent[-1][MISC] = 'MWT=Yes'
            i += 1
    return sent


EMAIL_RAW_RE = r"""(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:(?:2(?:5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(?:2(?:5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])"""

URL_RAW_RE = r"""(?:https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s"]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s"]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s"]{2,}|www\.[a-zA-Z0-9]+\.[^\s"]{2,})|[a-zA-Z0-9]+\.(?:gov|org|edu|net|com|co)(?:\.[^\s"]{2,})"""

MASK_RE = re.compile(f"(?:{EMAIL_RAW_RE}|{URL_RAW_RE})")


def find_spans(raw):
    """
    Return spans of text which don't contain <PAD> and are split by <PAD>
    """
    pads = [idx for idx, char in enumerate(raw) if char == '<PAD>']
    if len(pads) == 0:
        spans = [(0, len(raw))]
    else:
        prev = 0
        spans = []
        for pad in pads:
            if pad != prev:
                spans.append((prev, pad))
            prev = pad + 1
        if prev < len(raw):
            spans.append((prev, len(raw)))
    return spans
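

# Usage sketch (added for illustration; not part of the stanza API): the
# tokenizer predicts one label per character -- 0 keeps going, 1 ends a token,
# 2 ends a sentence, 3 ends a multi-word token (MWT), and 4 ends both an MWT
# and a sentence.  process_sentence() consumes (token, label, (start_char,
# end_char)) triples built from those labels; the MWT dictionary entry below
# is hypothetical.
def _example_process_sentence():
    mwt_dict = {"cannot": (["can", "not"], 1)}
    sentence = [("I", 1, (0, 1)), ("cannot", 3, (2, 8)), ("swim", 2, (9, 13))]
    return process_sentence(sentence, mwt_dict)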


def update_pred_regex(raw, pred):
    """
    Update the results of a tokenization batch by checking the raw text against a couple of regular expressions

    Currently, emails and URLs are handled
    TODO: this might work better as a constraint on the inference

    For efficiency, pred is modified in place
    """
    spans = find_spans(raw)

    for span_begin, span_end in spans:
        text = "".join(raw[span_begin:span_end])
        for match in MASK_RE.finditer(text):
            match_begin, match_end = match.span()
            # force every character inside the match to stay in one token:
            # clear any splits predicted inside the span...
            for i in range(match_begin + span_begin, match_end + span_begin - 1):
                pred[i] = 0
            # ... and make sure the span ends with at least a token break
            if pred[match_end + span_begin - 1] == 0:
                pred[match_end + span_begin - 1] = 1

    return pred
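

# Usage sketch (added for illustration; not part of the stanza API, and the toy
# text below is hypothetical): the predictions are per-character, so a spurious
# split predicted inside a URL is erased and the URL ends with a single token
# boundary instead.
def _example_update_pred_regex():
    raw = list("see http://x.co now")
    pred = [0] * len(raw)
    pred[2] = 1    # "see" ends here
    pred[10] = 1   # spurious split inside the URL
    pred[-1] = 2   # the paragraph ends with a sentence break
    return update_pred_regex(raw, pred)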


SPACE_RE = re.compile(r'\s')
SPACE_SPLIT_RE = re.compile(r'( *[^ ]+)')


def predict(trainer, data_generator, batch_size, max_seqlen, use_regex_tokens, num_workers=0):
    """
    The guts of the prediction method

    Calls trainer.predict() over and over until we have predictions for all of the text
    """
    all_preds = []
    all_raw = []

    sorted_data = SortedDataset(data_generator)
    dataloader = TorchDataLoader(sorted_data, batch_size=batch_size, collate_fn=sorted_data.collate, num_workers=num_workers)

    for batch_idx, batch in enumerate(dataloader):
        num_sentences = len(batch[3])
        N = len(batch[3][0])
        for paragraph in batch[3]:
            all_raw.append(list(paragraph))

        if N <= max_seqlen:
            pred = np.argmax(trainer.predict(batch), axis=2)
        else:
            # paragraphs longer than max_seqlen are predicted window by window:
            # predict a window, keep everything up to the last sentence break
            # found in it (or the whole window if there is none), then advance
            # the batch past the kept prefix and repeat until exhausted
            idx = [0] * num_sentences
            adv = [0] * num_sentences
            para_lengths = [x.index('<PAD>') if '<PAD>' in x else len(x) for x in batch[3]]
            pred = [[] for _ in range(num_sentences)]
            while True:
                ens = [min(N - idx1, max_seqlen) for idx1, N in zip(idx, para_lengths)]
                en = max(ens)
                batch1 = (batch[0][:, :en], batch[1][:, :en], batch[2][:, :en], [x[:en] for x in batch[3]])
                pred1 = np.argmax(trainer.predict(batch1), axis=2)

                for j in range(num_sentences):
                    sentbreaks = np.where((pred1[j] == 2) + (pred1[j] == 4))[0]
                    if len(sentbreaks) <= 0 or idx[j] >= para_lengths[j]:
                        advance = ens[j]
                    else:
                        advance = np.max(sentbreaks) + 1

                    pred[j] += [pred1[j, :advance]]
                    idx[j] += advance
                    adv[j] = advance

                if all(idx1 >= N for idx1, N in zip(idx, para_lengths)):
                    break
                batch = data_generator.advance_old_batch(adv, batch)

            pred = [np.concatenate(par_pred, 0) for par_pred in pred]

        for par_idx in range(num_sentences):
            offset = batch_idx * batch_size + par_idx
            raw = all_raw[offset]
            # strip the padding off of the raw characters and the predictions
            par_len = raw.index('<PAD>') if '<PAD>' in raw else len(raw)
            raw = raw[:par_len]
            all_raw[offset] = raw
            # make sure every paragraph ends with a sentence break
            if pred[par_idx][par_len-1] < 2:
                pred[par_idx][par_len-1] = 2
            elif pred[par_idx][par_len-1] > 2:
                pred[par_idx][par_len-1] = 4
            if use_regex_tokens:
                all_preds.append(update_pred_regex(raw, pred[par_idx][:par_len]))
            else:
                all_preds.append(pred[par_idx][:par_len])

    all_preds = sorted_data.unsort(all_preds)
    all_raw = sorted_data.unsort(all_raw)

    return all_preds, all_raw


def output_predictions(output_file, trainer, data_generator, vocab, mwt_dict, max_seqlen=1000,
                       orig_text=None, no_ssplit=False, use_regex_tokens=True, num_workers=0,
                       postprocessor=None):
    batch_size = trainer.args['batch_size']
    max_seqlen = max(1000, max_seqlen)
    all_preds, all_raw = predict(trainer, data_generator, batch_size, max_seqlen, use_regex_tokens, num_workers)

    use_la_ittb_shorthand = trainer.args['shorthand'] == 'la_ittb'
    skip_newline = trainer.args['skip_newline']
    oov_count, offset, doc = decode_predictions(vocab, mwt_dict, orig_text, all_raw, all_preds,
                                                no_ssplit, skip_newline, use_la_ittb_shorthand)

    if postprocessor:
        doc = postprocess_doc(doc, postprocessor, orig_text)

    if output_file:
        CoNLL.dict2conll(doc, output_file)

    return oov_count, offset, all_preds, doc


def postprocess_doc(doc, postprocessor, orig_text=None):
    """Applies a postprocessor on the doc"""
    tokens = [[(x['text'], True) if x.get('misc', None) == 'MWT=Yes' else x['text'] for x in sentence]
              for sentence in doc]

    if not orig_text:
        raw_text = " ".join(" ".join(x if isinstance(x, str) else x[0] for x in sentence)
                            for sentence in tokens)
    else:
        raw_text = orig_text

    postprocessor_return = postprocessor(tokens)

    corrected_words = []
    corrected_mwts = []
    corrected_expansions = []

    for sentence in postprocessor_return:
        sent_words = []
        sent_mwts = []
        sent_expansions = []
        for word in sentence:
            if isinstance(word, str):
                # a plain string: not an MWT, no user-supplied expansion
                sent_words.append(word)
                sent_mwts.append(False)
                sent_expansions.append(None)
            elif isinstance(word[1], bool):
                # a (token, is_mwt) pair
                sent_words.append(word[0])
                sent_mwts.append(word[1])
                sent_expansions.append(None)
            else:
                # a (token, expansion) pair: an MWT with a user-supplied expansion
                sent_words.append(word[0])
                sent_mwts.append(True)
                sent_expansions.append(word[1])
        corrected_words.append(sent_words)
        corrected_mwts.append(sent_mwts)
        corrected_expansions.append(sent_expansions)

    token_lens = [len(x) for x in corrected_words]
    mwt_lens = [len(x) for x in corrected_mwts]
    assert token_lens == mwt_lens, \
        "Postprocessor returned token and MWT lists of different length! Token list lengths %s, MWT list lengths %s" % (token_lens, mwt_lens)

    doc = reassemble_doc_from_tokens(corrected_words, corrected_mwts, corrected_expansions, raw_text)
    return doc


def reassemble_doc_from_tokens(tokens, mwts, expansions, raw_text):
    """Assemble a Stanza document list format from a list of string tokens, calculating offsets as needed.

    Parameters
    ----------
    tokens : List[List[str]]
        A list of sentences, which includes string tokens.
    mwts : List[List[bool]]
        Whether or not each of the tokens is an MWT to be analyzed by the MWT system.
    expansions : List[List[Optional[List[str]]]]
        A list of possible expansions for MWTs, or None if no user-defined expansion
        is given.
    raw_text : str
        The raw text against which we can compute offsets.

    Returns
    -------
    List[List[Dict]]
        List of words and their offsets, used as `doc`.
    """
    new_offset = 0
    corrected_doc = []

    for sentence, sentence_mwts, sentence_expansions in zip(tokens, mwts, expansions):
        sentence_doc = []
        for indx, (token, mwt, expansion) in enumerate(zip(sentence, sentence_mwts, sentence_expansions)):
            try:
                offset_index = raw_text.index(token, new_offset)
            except ValueError as e:
                sub_start = max(0, new_offset - 20)
                sub_end = min(len(raw_text), new_offset + 20)
                subwd = raw_text[sub_start:sub_end]
                raise ValueError("Could not find word |%s| starting from char_offset %d.  Surrounding text: |%s|. \n Hint: did you accidentally add/subtract a symbol/character such as a space when combining tokens?" % (token, new_offset, subwd)) from e
            token_entry = {
                "id": (indx+1, ),
                "text": token,
                "start_char": offset_index,
                "end_char": offset_index + len(token)
            }
            if expansion:
                token_entry["manual_expansion"] = True
            elif mwt:
                token_entry["misc"] = "MWT=Yes"
            sentence_doc.append(token_entry)
            new_offset = offset_index + len(token)
        corrected_doc.append(sentence_doc)

    doc = Document(corrected_doc, raw_text)
    doc.set_mwt_expansions([expansion for sentence in expansions for expansion in sentence if expansion],
                           process_manual_expanded=True)
    return doc.to_dict()
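

# Usage sketch (added for illustration; not part of the stanza API, and the toy
# postprocessor output below is hypothetical): a postprocessor decided that
# "won't" is a single token that should later be analyzed as an MWT, so it is
# flagged via the mwts list while the character offsets are recomputed from the
# raw text.
def _example_reassemble_doc_from_tokens():
    tokens = [["I", "won't", "go", "."]]
    mwts = [[False, True, False, False]]
    expansions = [[None, None, None, None]]
    return reassemble_doc_from_tokens(tokens, mwts, expansions, "I won't go.")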


def decode_predictions(vocab, mwt_dict, orig_text, all_raw, all_preds, no_ssplit, skip_newline, use_la_ittb_shorthand):
    """
    Decode the predictions into a document of words

    Once everything is fed through the tokenizer model, it's time to decode the predictions
    into actual tokens and sentences that the rest of the pipeline uses
    """
    offset = 0
    oov_count = 0
    doc = []

    text = SPACE_RE.sub(' ', orig_text) if orig_text is not None else None
    char_offset = 0
    UNK_ID = vocab.unit2id('<UNK>') if vocab is not None else None

    for raw, pred in zip(all_raw, all_preds):
        current_tok = ''
        current_sent = []

        for t, p in zip(raw, pred):
            if t == '<PAD>':
                break
            # hack for la_ittb: always treat ':' and ';' as sentence breaks
            if use_la_ittb_shorthand and t in (":", ";"):
                p = 2
            offset += 1
            if vocab is not None and vocab.unit2id(t) == UNK_ID:
                oov_count += 1

            current_tok += t
            if p >= 1:
                tok = vocab.normalize_token(current_tok) if vocab is not None else current_tok
                assert '\t' not in tok, tok
                if len(tok) <= 0:
                    current_tok = ''
                    continue

                if orig_text is not None:
                    # find the position of this token in the original text so
                    # that character offsets can be recorded
                    st = -1
                    for part in SPACE_SPLIT_RE.split(current_tok):
                        if len(part) == 0:
                            continue
                        if skip_newline:
                            part_pattern = re.compile(r'\s*'.join(re.escape(c) for c in part))
                            match = part_pattern.search(text, char_offset)
                            st0 = match.start(0) - char_offset
                            partlen = match.end(0) - match.start(0)
                        else:
                            try:
                                st0 = text.index(part, char_offset) - char_offset
                            except ValueError as e:
                                sub_start = max(0, char_offset - 20)
                                sub_end = min(len(text), char_offset + 20)
                                subwd = text[sub_start:sub_end]
                                raise ValueError("Could not find |%s| starting from char_offset %d.  Surrounding text: |%s|" % (part, char_offset, subwd)) from e
                            partlen = len(part)
                        lstripped = part.lstrip()
                        if st < 0:
                            st = char_offset + st0 + (len(part) - len(lstripped))
                        char_offset += st0 + partlen
                    position_info = (st, char_offset)
                else:
                    position_info = None

                current_sent.append((tok, p, position_info))
                current_tok = ''
                if (p == 2 or p == 4) and not no_ssplit:
                    doc.append(process_sentence(current_sent, mwt_dict))
                    current_sent = []

        if len(current_tok) > 0:
            raise ValueError("Finished processing tokens, but there is still text left!")
        if len(current_sent):
            doc.append(process_sentence(current_sent, mwt_dict))

    return oov_count, offset, doc


def match_tokens_with_text(sentences, orig_text):
    """
    Turns pretokenized text and the original text into a Doc object

    sentences: list of list of string
    orig_text: string, where the text must be exactly the sentences
      concatenated with 0 or more whitespace characters

    if orig_text deviates in any way, a ValueError will be thrown
    """
    text = "".join("".join(token for token in sentence) for sentence in sentences)
    pred = [0] * len(text)
    idx = 0
    for sentence in sentences:
        for token in sentence:
            idx += len(token)
            pred[idx-1] = 1
        pred[idx-1] = 2

    _, _, doc = decode_predictions(None, None, orig_text, [text], [pred], False, False, False)
    doc = Document(doc, orig_text)

    end_char = doc.sentences[-1].tokens[-1].end_char
    remainder = orig_text[end_char:]
    if len(remainder.strip()) > 0:
        raise ValueError("Finished processing tokens, but there is still text left!")
    return doc
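

# Usage sketch (added for illustration; not part of the stanza API, and the toy
# input below is hypothetical): the tokens must reproduce the original text
# exactly, up to the whitespace between them.
def _example_match_tokens_with_text():
    sentences = [["This", "is", "a", "test", "."]]
    return match_tokens_with_text(sentences, "This is a test.")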


def eval_model(args, trainer, batches, vocab, mwt_dict):
    oov_count, N, all_preds, doc = output_predictions(args['conll_file'], trainer, batches, vocab, mwt_dict, args['max_seqlen'])

    all_preds = np.concatenate(all_preds, 0)
    labels = batches.labels()
    counter = Counter(zip(all_preds, labels))

    def f1(pred, gold, mapping):
        pred = [mapping[p] for p in pred]
        gold = [mapping[g] for g in gold]

        lastp = -1
        lastg = -1
        tp = 0
        fp = 0
        fn = 0
        for i, (p, g) in enumerate(zip(pred, gold)):
            if p == g > 0 and lastp == lastg:
                lastp = i
                lastg = i
                tp += 1
            elif p > 0 and g > 0:
                lastp = i
                lastg = i
                fp += 1
                fn += 1
            elif p > 0:
                # and g == 0
                lastp = i
                fp += 1
            elif g > 0:
                lastg = i
                fn += 1

        if tp == 0:
            return 0
        else:
            return 2 * tp / (2 * tp + fp + fn)

    f1tok = f1(all_preds, labels, {0: 0, 1: 1, 2: 1, 3: 1, 4: 1})
    f1sent = f1(all_preds, labels, {0: 0, 1: 0, 2: 1, 3: 0, 4: 1})
    f1mwt = f1(all_preds, labels, {0: 0, 1: 1, 2: 1, 3: 2, 4: 2})
    logger.info(f"{args['shorthand']}: token F1 = {f1tok*100:.2f}, sentence F1 = {f1sent*100:.2f}, mwt F1 = {f1mwt*100:.2f}")
    return harmonic_mean([f1tok, f1sent, f1mwt], [1, 1, .01])