import stanza

from stanza.models.common import doc
from stanza.models.tokenization.data import TokenizationDataset
from stanza.models.tokenization.utils import predict, decode_predictions


def mwts_composed_of_words(doc):
    """
    Return True if the MWTs in the doc are all exactly composed of the text in their words, False otherwise
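
    For example, a French token "du" split into the words "de" and "le" fails
    this check ("de" + "le" != "du"), while an English "can't" split into
    "ca" and "n't" passes.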
    """
    for sent_idx, sentence in enumerate(doc.sentences):
        for token_idx, token in enumerate(sentence.tokens):
            if len(token.words) > 1:
                expected_text = "".join(x.text for x in token.words)
                if token.text != expected_text:
                    return False
    return True


def resplit_mwt(tokens, pipeline, keep_tokens=True):
    """
    Uses the tokenize processor and the mwt processor in the pipeline to resplit tokens into MWT

    tokens: a list of list of string
    pipeline: a Stanza pipeline which contains, at a minimum, tokenize and mwt

    keep_tokens: if True, enforce the old token boundaries by modifying
      the results of the tokenize inference.
      Otherwise, use whatever new boundaries the model comes up with.

    Between running the tokenize model and decoding the text back into
    tokens, we update all_preds to use the original token boundaries
    (if and only if keep_tokens == True).

    This method returns a Document with just the tokens and words annotated.
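
    Example (a sketch; assumes the relevant models are downloaded, and the
    exact word splits depend on the loaded MWT model)::

        pipe = stanza.Pipeline("en", processors="tokenize,mwt")
        doc = resplit_mwt([["I", "can't", "sleep"]], pipe)
        # the "can't" token is kept (keep_tokens defaults to True) but is
        # now an MWT carrying its words, typically "ca" + "n't"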
    """
    if 'tokenize' not in pipeline.processors:
        raise ValueError("Need a Pipeline with a valid tokenize processor")
    if 'mwt' not in pipeline.processors:
        raise ValueError("Need a Pipeline with a valid mwt processor")

    tokenize_processor = pipeline.processors['tokenize']
    mwt_processor = pipeline.processors['mwt']

    # Glue the pretokenized text back together, with spaces between tokens
    # and blank lines between sentences, so the tokenizer model can rescore it
    fake_text = "\n\n".join(" ".join(sentence) for sentence in tokens)
    batches = TokenizationDataset(tokenize_processor.config,
                                  input_text=fake_text,
                                  vocab=tokenize_processor.vocab,
                                  evaluation=True,
                                  dictionary=tokenize_processor.trainer.dictionary)

    all_preds, all_raw = predict(trainer=tokenize_processor.trainer,
                                 data_generator=batches,
                                 batch_size=tokenize_processor.config['batch_size'],
                                 max_seqlen=tokenize_processor.config.get('max_seqlen', tokenize_processor.MAX_SEQ_LENGTH_DEFAULT),
                                 use_regex_tokens=tokenize_processor.config.get('use_regex_tokens', True),
                                 num_workers=tokenize_processor.config.get('num_workers', 0))

    if keep_tokens:
        # Overwrite the tokenizer's predictions so that the decoded token
        # boundaries exactly match the tokens we were given: no break inside
        # a token (0), and at least an end-of-token break (1) after each
        # token, without downgrading sentence-end or MWT labels the model
        # already predicted.  The extra +1 skips the space inserted between
        # tokens when building fake_text.
        for sentence, pred in zip(tokens, all_preds):
            char_idx = 0
            for token in sentence:
                if len(token) > 1:
                    pred[char_idx:char_idx + len(token) - 1] = 0
                if pred[char_idx + len(token) - 1] == 0:
                    pred[char_idx + len(token) - 1] = 1
                char_idx += len(token) + 1

    _, _, document = decode_predictions(vocab=tokenize_processor.vocab,
                                        mwt_dict=None,
                                        orig_text=fake_text,
                                        all_raw=all_raw,
                                        all_preds=all_preds,
                                        no_ssplit=True,
                                        skip_newline=tokenize_processor.config.get('skip_newline', False),
                                        use_la_ittb_shorthand=tokenize_processor.config.get('shorthand', '') == 'la_ittb')

    word_document = doc.Document(document, fake_text)
    word_document = mwt_processor.process(word_document)
    return word_document


def main():
    pipe = stanza.Pipeline("en", processors="tokenize,mwt", package="gum")
    # the second sentence deliberately glues "I can't" into a single token
    tokens = [["I", "can't", "believe", "it"], ["I can't", "sleep"]]
    document = resplit_mwt(tokens, pipe)
    print(document)
    document = resplit_mwt(tokens, pipe, keep_tokens=False)
    print(document)


if __name__ == '__main__':
    main()