"""
Common methods for the various self-training data collection scripts
"""

import logging
import os
import random
import re

import stanza
from stanza.models.common import utils
from stanza.models.common.bert_embedding import TextTooLongError
from stanza.utils.get_tqdm import get_tqdm

logger = logging.getLogger('stanza')

tqdm = get_tqdm()

def common_args(parser):
    parser.add_argument(
        '--output_file',
        default='data/constituency/vi_silver.mrg',
        help='Where to write the silver trees'
    )
    parser.add_argument(
        '--lang',
        default='vi',
        help='Which language tools to use for tokenization and POS'
    )
    parser.add_argument(
        '--num_sentences',
        type=int,
        default=-1,   # assumed default; a value < 0 means "no limit" in find_matching_trees below
        help='How many sentences to get per file (max)'
    )
    parser.add_argument(
        '--models',
        default='saved_models/constituency/vi_vlsp21_inorder.pt',
        help='What models to use for parsing.  comma-separated'
    )
    parser.add_argument(
        '--package',
        default='default',   # assumed default
        help='Which package to load pretrain & charlm from for the parsers'
    )
    parser.add_argument(
        '--output_ptb',
        default=False,
        action='store_true',
        help='Output trees in PTB brackets (default is a bracket language format)'
    )

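# Illustrative sketch (not part of the original module): a collection script
# would normally combine common_args with add_length_args (defined just below)
# when building its own argument parser.  The description string here is
# hypothetical.
def _example_build_argparser():
    """Assemble an argument parser the way a downstream collection script might."""
    import argparse

    parser = argparse.ArgumentParser(description="Collect silver constituency trees")
    common_args(parser)
    add_length_args(parser)
    return parser
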
def add_length_args(parser):
    parser.add_argument('--min_len', default=5, type=int,   # assumed default
                        help='Minimum length sentence to keep.  None = unlimited')
    parser.add_argument('--no_min_len', dest='min_len', action='store_const', const=None,
                        help='No minimum length')
    parser.add_argument('--max_len', default=100, type=int,
                        help='Maximum length sentence to keep.  None = unlimited')
    parser.add_argument('--no_max_len', dest='max_len', action='store_const', const=None,
                        help='No maximum length')

def build_ssplit_pipe(ssplit, lang):
    if ssplit:
        return stanza.Pipeline(lang, processors="tokenize")
    else:
        return stanza.Pipeline(lang, processors="tokenize", tokenize_no_ssplit=True)

def build_tag_pipe(ssplit, lang, foundation_cache=None):
    if ssplit:
        return stanza.Pipeline(lang, processors="tokenize,pos", foundation_cache=foundation_cache)
    else:
        return stanza.Pipeline(lang, processors="tokenize,pos", tokenize_no_ssplit=True, foundation_cache=foundation_cache)

def build_parser_pipes(lang, models, package="default", foundation_cache=None):   # package default assumed
    """
    Build separate pipelines for each parser model we want to use

    It is highly recommended to pass in a FoundationCache to reuse bottom layers
    """
    parser_pipes = []
    for model_name in models.split(","):
        if os.path.exists(model_name):
            # a model file we trained ourselves: load it directly via its path
            pipe = stanza.Pipeline(lang, processors="constituency", package=package,
                                   constituency_model_path=model_name,
                                   constituency_pretagged=True,
                                   foundation_cache=foundation_cache)
        else:
            # otherwise treat the name as a downloadable constituency package
            pipe = stanza.Pipeline(lang, processors="constituency",
                                   package={"constituency": model_name},
                                   constituency_pretagged=True,
                                   foundation_cache=foundation_cache)
        parser_pipes.append(pipe)
    return parser_pipes

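# Illustrative sketch (not part of the original module): building the sentence
# splitting, tagging, and parsing pipelines so that they share one
# FoundationCache and therefore only load the pretrain and charlm once.  The
# model path is hypothetical, and the FoundationCache import location is
# assumed from stanza's package layout.
def _example_build_pipelines(lang="vi", models="saved_models/constituency/example_model.pt"):
    """Build ssplit, tagging, and parsing pipelines which share foundation models."""
    from stanza.models.common.foundation_cache import FoundationCache

    foundation_cache = FoundationCache()
    ssplit_pipe = build_ssplit_pipe(ssplit=True, lang=lang)
    tag_pipe = build_tag_pipe(ssplit=False, lang=lang, foundation_cache=foundation_cache)
    parser_pipes = build_parser_pipes(lang, models, foundation_cache=foundation_cache)
    return ssplit_pipe, tag_pipe, parser_pipes
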
def split_docs(docs, ssplit_pipe, max_len=140, max_word_len=50, chunk_size=2000):   # numeric defaults assumed
    """
    Using the ssplit pipeline, break up the documents into sentences

    Filters out sentences which are too long or have words too long.

    This step is necessary because some web text has unstructured
    sentences which overwhelm the tagger, or even text with no
    whitespace which breaks the charlm in the tokenizer or tagger
    """
    raw_sentences = 0
    filtered_sentences = 0
    new_docs = []

    logger.info("Splitting raw docs into sentences: %d", len(docs))
    for chunk_start in tqdm(range(0, len(docs), chunk_size)):
        chunk = docs[chunk_start:chunk_start+chunk_size]
        chunk = [stanza.Document([], text=t) for t in chunk]
        chunk = ssplit_pipe(chunk)
        sentences = [sentence for doc in chunk for sentence in doc.sentences]
        raw_sentences += len(sentences)
        # drop sentences which are too long or which contain words that are too long
        sentences = [sentence for sentence in sentences if len(sentence.words) < max_len]
        sentences = [sentence for sentence in sentences if max(len(word.text) for word in sentence.words) < max_word_len]
        filtered_sentences += len(sentences)
        new_docs.extend([sentence.text for sentence in sentences])

    logger.info("Split sentences: %d", raw_sentences)
    logger.info("Sentences filtered for length: %d", filtered_sentences)
    return new_docs

# character classes for Chinese, Japanese, and Devanagari text,
# used in tokenize_docs to throw away sentences in an unexpected script
ZH_RE = re.compile("[⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]", re.UNICODE)
JA_RE = re.compile("[一-龠ぁ-ゔァ-ヴー々〆〤ヶ]", re.UNICODE)
DEV_RE = re.compile("[ऀ-ॿ]", re.UNICODE)

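# Illustrative sketch (not part of the original module): using split_docs to
# break blocks of raw text into individual, length-filtered sentences.  The
# input strings are made up.
def _example_split_docs():
    """Split a couple of raw text blocks into sentence strings."""
    ssplit_pipe = build_ssplit_pipe(ssplit=True, lang="en")
    raw_docs = [
        "This is a short paragraph.  It has two sentences.",
        "Another block of text with just one sentence.",
    ]
    # the result is a list of sentence texts, with long sentences and long words removed
    return split_docs(raw_docs, ssplit_pipe)
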
def tokenize_docs(docs, pipe, min_len, max_len):
    """
    Turn the text in docs into a list of whitespace separated sentences

    docs: a list of strings
    pipe: a Stanza pipeline for tokenizing
    min_len, max_len: can be None to not filter by this attribute
    """
    results = []
    docs = [stanza.Document([], text=t) for t in docs]
    if len(docs) == 0:
        return results
    pipe(docs)
    is_zh = pipe.lang and pipe.lang.startswith("zh")
    is_ja = pipe.lang and pipe.lang.startswith("ja")
    is_vi = pipe.lang and pipe.lang.startswith("vi")
    for doc in docs:
        for sentence in doc.sentences:
            if min_len and len(sentence.words) < min_len:
                continue
            if max_len and len(sentence.words) > max_len:
                continue
            text = sentence.text
            # skip sentences with characters which suggest markup, tables, or lists
            if (text.find("|") >= 0 or text.find("_") >= 0 or
                text.find("<") >= 0 or text.find(">") >= 0 or
                text.find("[") >= 0 or text.find("]") >= 0 or
                text.find("—") >= 0):
                continue
            # skip sentences where quotes or parens were left attached to a longer word
            if any(any(word.text.find(c) >= 0 and len(word.text) > 1 for word in sentence.words) for c in '"()'):
                continue
            # rejoin the sentence with single spaces between words; spaces inside
            # a word (eg multiword Vietnamese tokens) are replaced with _
            words = [word.text.replace(" ", "_") for word in sentence.words]
            text = " ".join(words)
            # skip sentences with suspiciously long words (threshold assumed)
            if any(len(word.text) > 50 for word in sentence.words):
                continue
            # skip sentences in an unexpected script
            if not is_zh and len(ZH_RE.findall(text)) > 0:
                continue
            if not is_ja and len(JA_RE.findall(text)) > 0:
                continue
            if is_vi and len(DEV_RE.findall(text)) > 0:
                continue
            results.append(text)
    return results

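# Illustrative sketch (not part of the original module): tokenize_docs turns
# raw strings into whitespace separated sentence strings, applying the length
# and script filters above.  The language and input text are made up.
def _example_tokenize_docs():
    """Tokenize raw Vietnamese text into clean, whitespace separated sentences."""
    pipe = build_ssplit_pipe(ssplit=True, lang="vi")
    raw_docs = ["Đây là một câu ví dụ."]
    return tokenize_docs(raw_docs, pipe, min_len=5, max_len=100)
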
def find_matching_trees(docs, num_sentences, accepted_trees, tag_pipe, parser_pipes, shuffle=True, chunk_size=10, max_len=140, min_len=10, output_ptb=False):   # numeric defaults assumed
    """
    Find trees where all the parsers in parser_pipes agree

    docs should be a list of strings.
      one sentence per string or a whole block of text as long as the tag_pipe can break it into sentences

    num_sentences > 0 gives an upper limit on how many sentences to extract.
      If < 0, all possible sentences are extracted

    accepted_trees is a running tally of all the trees already built,
      so that we don't reuse the same sentence if we see it again
    """
    if num_sentences < 0:
        tqdm_total = len(docs)
    else:
        tqdm_total = num_sentences

    # "{:L}" prints the trees in a bracket language format, "{}" in PTB brackets
    output_format = "{}" if output_ptb else "{:L}"

    with tqdm(total=tqdm_total, leave=False) as pbar:
        if shuffle:
            random.shuffle(docs)

        new_trees = set()
        for chunk_start in range(0, len(docs), chunk_size):
            chunk = docs[chunk_start:chunk_start+chunk_size]
            chunk = [stanza.Document([], text=t) for t in chunk]
            if num_sentences < 0:
                pbar.update(len(chunk))

            tag_pipe(chunk)

            chunk = [doc for doc in chunk if len(doc.sentences) > 0]
            if max_len is not None:
                chunk = [doc for doc in chunk if max(len(sentence.words) for sentence in doc.sentences) < max_len]
            if len(chunk) == 0:
                continue

            parses = []
            try:
                for pipe in parser_pipes:
                    pipe(chunk)
                    trees = [output_format.format(sent.constituency) for doc in chunk for sent in doc.sentences if len(sent.words) >= min_len]
                    parses.append(trees)
            except TextTooLongError:
                # a sentence was too long for one of the parsers - skip this chunk
                continue

            # only keep trees where every parser produced exactly the same parse
            for tree in zip(*parses):
                if len(set(tree)) > 1:
                    continue
                tree = tree[0]
                if tree in accepted_trees:
                    continue
                if tree not in new_trees:
                    new_trees.add(tree)
                    if num_sentences >= 0:
                        pbar.update(1)
                if num_sentences > 0 and len(new_trees) >= num_sentences:
                    return new_trees

    return new_trees

,,r   rO   )r   N)r6   r7   r8   )Tr   r6   r   F)__doc__loggingr-   r   rer   stanza.models.commonr   #stanza.models.common.bert_embeddingr   stanza.utils.get_tqdmr   	getLoggerrX   rZ   r   r   r$   r&   r5   re   compileUNICODEr{   r}   r~   r   r   r   r   r   r   <module>   s,    
"


7