import os
import logging

from stanza.models.common import utils
# FoundationCache is referenced in load_model_parse_text below
from stanza.models.common.foundation_cache import FoundationCache
from stanza.models.constituency.utils import retag_tags
from stanza.models.constituency.trainer import Trainer
from stanza.models.constituency.tree_reader import read_trees
from stanza.utils.get_tqdm import get_tqdm

logger = logging.getLogger('stanza')

tqdm = get_tqdm()

def read_tokenized_file(tokenized_file):
    """
    Read sentences from a tokenized file, potentially replacing _ with space for languages such as VI
    """
    with open(tokenized_file, encoding="utf-8") as fin:
        lines = fin.readlines()
    lines = [x.strip() for x in lines]
    lines = [x for x in lines if x]
    docs = [[word if all(x == '_' for x in word) else word.replace("_", " ") for word in sentence.split()]
            for sentence in lines]
    ids = [None] * len(docs)
    return docs, ids
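
# Example (added for illustration): for Vietnamese-style input where the
# syllables of a word are joined by underscores, a line such as
#     Hà_Nội là thủ_đô
# is returned as the word list ["Hà Nội", "là", "thủ đô"], while a token
# consisting only of underscores (e.g. a literal "_") is kept as-is.
# The matching ids entry is None, since plain tokenized files carry no tree ids.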

def read_xml_tree_file(tree_file):
    """
    Read sentences from a file of the format unique to VLSP test sets

    in particular, it should be multiple blocks of

    <s id=1>
      (tree ...)
    </s>
    """
    with open(tree_file, encoding="utf-8") as fin:
        lines = fin.readlines()
    lines = [x.strip() for x in lines]
    lines = [x for x in lines if x]

    docs = []
    ids = []
    tree_id = None
    tree_text = []
    for line in lines:
        if line.startswith("<s"):
            pieces = line.split("=")
            if len(pieces) > 1:
                tree_id = pieces[1]
                if tree_id.endswith(">"):
                    tree_id = tree_id[:-1]
                tree_id = int(tree_id)
            else:
                tree_id = None
        elif line.startswith("</s"):
            if len(tree_text) == 0:
                raise ValueError("Found a blank tree in %s" % tree_file)
            ids.append(tree_id)
            text = " ".join(tree_text)
            trees = read_trees(text)
            trees = [t.prune_none().simplify_labels() for t in trees]
            if len(trees) != 1:
                raise ValueError("Found a tree with %d trees in %s" % (len(trees), tree_file))
            tree = trees[0]
            text = tree.leaf_labels()
            text = [word if all(x == '_' for x in word) else word.replace("_", " ") for word in text]
            docs.append(text)
            tree_text = []
            tree_id = None
        else:
            tree_text.append(line)
    return docs, ids

def parse_tokenized_sentences(args, model, retag_pipeline, sentences):
    """
    Parse the given sentences, return a list of ParseResult objects
    """
    tags = retag_tags(sentences, retag_pipeline, model.uses_xpos())
    words = [[(word, tag) for word, tag in zip(s_words, s_tags)]
             for s_words, s_tags in zip(sentences, tags)]
    logger.info("Retagging finished.  Parsing tagged text")

    assert len(words) == len(sentences)
    treebank = model.parse_sentences_no_grad(iter(tqdm(words)),
                                             model.build_batch_from_tagged_words,
                                             args['eval_batch_size'],
                                             model.predict,
                                             keep_scores=False)
    return treebank
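
# Note (added for illustration; not in the original source): `sentences` is a
# list of word lists such as [["This", "is", "a", "test"]], i.e. the `docs`
# produced by read_tokenized_file / read_xml_tree_file above.  Each entry of
# the returned treebank is a ParseResult whose best tree is reachable as
# result.predictions[0].tree, which is how parse_text below consumes it.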

def parse_text(args, model, retag_pipeline, tokenized_file=None, predict_file=None):
    """
    Use the given model to parse text and write it

    refactored so it can be used elsewhere, such as Ensemble
    """
    model.eval()

    if predict_file is None and args['predict_file']:
        predict_file = args['predict_file']
        if args['predict_dir']:
            predict_file = os.path.join(args['predict_dir'], predict_file)
    if tokenized_file is None:
        tokenized_file = args['tokenized_file']

    docs, ids = None, None
    if tokenized_file is not None:
        docs, ids = read_tokenized_file(tokenized_file)
    elif args['xml_tree_file']:
        logger.info("Reading trees from %s", args['xml_tree_file'])
        docs, ids = read_xml_tree_file(args['xml_tree_file'])

    if not docs:
        logger.error("No sentences to process!")
        return

    logger.info("Processing %d sentences", len(docs))
    with utils.output_stream(predict_file) as fout:
        chunk_size = 10000
        for chunk_start in range(0, len(docs), chunk_size):
            chunk = docs[chunk_start:chunk_start + chunk_size]
            ids_chunk = ids[chunk_start:chunk_start + chunk_size]
            logger.info("Processing trees %d to %d", chunk_start, chunk_start + len(chunk))
            treebank = parse_tokenized_sentences(args, model, retag_pipeline, chunk)
            for result, tree_id in zip(treebank, ids_chunk):
                tree = result.predictions[0].tree
                if tree_id is not None:
                    tree.tree_id = tree_id
                fout.write(args['predict_format'].format(tree))
                fout.write("\n")


def parse_dir(args, model, retag_pipeline, tokenized_dir, predict_dir):
    os.makedirs(predict_dir, exist_ok=True)
    for filename in os.listdir(tokenized_dir):
        input_path = os.path.join(tokenized_dir, filename)
        output_path = os.path.join(predict_dir, os.path.splitext(filename)[0] + ".mrg")
        logger.info("Processing %s to %s", input_path, output_path)
        parse_text(args, model, retag_pipeline, tokenized_file=input_path, predict_file=output_path)

def load_model_parse_text(args, model_file, retag_pipeline):
    """
    Load a model, then parse text and write it to stdout or args['predict_file']

    retag_pipeline: a list of Pipeline meant to be used for retagging
    """
    foundation_cache = retag_pipeline[0].foundation_cache if retag_pipeline else FoundationCache()
    load_args = {
        'wordvec_pretrain_file': args['wordvec_pretrain_file'],
        'charlm_forward_file': args['charlm_forward_file'],
        'charlm_backward_file': args['charlm_backward_file'],
        'device': args['device'],
    }
    trainer = Trainer.load(model_file, args=load_args, foundation_cache=foundation_cache)
    model = trainer.model
    model.eval()
    logger.info("Loaded model from %s", model_file)

    if args['tokenized_dir']:
        if not args['predict_dir']:
            raise ValueError("Must specify --predict_dir to go with --tokenized_dir")
        parse_dir(args, model, retag_pipeline, args['tokenized_dir'], args['predict_dir'])
    else:
        parse_text(args, model, retag_pipeline)