"""
Builds a self-training dataset from an Italian data source and two models

The idea is that the top down and the inorder parsers should make
somewhat different errors, so hopefully the sum of an 86 f1 parser and
an 85.5 f1 parser will produce some half-decent silver trees which can
be used as self-training so that a new model can do better than either.

The dataset used is PaCCSS, which has 63000 pairs of sentences:

http://www.italianlp.it/resources/paccss-it-parallel-corpus-of-complex-simple-sentences-for-italian/
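
Example invocation (illustrative: --lang, --models, --output_file, and
--num_sentences are assumed to be the flags added by selftrain.common_args,
and the paths shown are hypothetical):

  python3 -m stanza.utils.datasets.constituency.selftrain_wiki \
      --input_dir extern_data/italian/wikipedia/text --lang it \
      --output_file it_silver.trees

with --models pointing at the top down and inorder parser checkpoints.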
    N)deque)FoundationCache)	selftrain)get_tqdmc                  C   sP   t jdd} t|  | jdddd | jddd	d
d | jdd |  }|S )NzFScript that converts part of a wikipedia dump to silver standard trees)descriptionz--input_dirz%extern_data/vietnamese/wikipedia/textz<Path to the wikipedia dump after processing by wikiextractor)defaulthelpz--no_shuffleshufflestore_falsez1Don't shuffle files when processing the directory)destactionr   i'  )num_sentences)argparseArgumentParserr   common_argsadd_argumentset_defaults
parse_args)parserargs r   l/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/utils/datasets/constituency/selftrain_wiki.pyr      s$   

def list_wikipedia_files(input_dir):
    """
    Get a list of wiki files under the input_dir

    Recursively traverse the directory, then sort
       wiki_*r   )ospathisdirsplit
startswithr   extendglobjoinlenpopappendsort)	input_dir
wiki_filesrecursive_files	next_filer   r   r   list_wikipedia_files/   s   "
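
# For reference, wikiextractor writes its output as two-letter shard
# directories of numbered files, for example (illustrative layout):
#   text/AA/wiki_00
#   text/AA/wiki_01
#   text/AB/wiki_00
# so list_wikipedia_files("text") returns every such wiki_* file, sorted.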
r+   c                 C   s  t | }| }W d   n1 sw   Y  g }g }t|}t|d}|dur||dr4t|d}n?|drL|rKt|dkrI|d| g }n'|dd}|dd}|	 }|
d	d
ksj|
dd
krld}|rs|| t|d}|dus)|r|d| |S )a	  
    Read the text from a wiki file as a list of paragraphs.

    Each <doc> </doc> is its own item in the list.
    Lines are separated by \n\n to give hints to the stanza tokenizer.
    The first line after <doc> is skipped as it is usually the document title.
    Nz<docz</doc   z

z() z( )z&lt;r   z&gt; )open	readlinesiternextr   r#   r%   r"   replacestripfind)filenamefinlinesdocscurrent_docline_iteratorliner   r   r   read_wiki_fileF   s8   






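
# Each wiki_* file contains blocks of the following shape (illustrative):
#
#   <doc id="12" url="..." title="Example">
#   Example
#   First paragraph of the article ...
#   </doc>
#
# read_wiki_file turns each such block into one string, dropping the title
# line and joining the remaining lines with blank lines.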

def main():
    args = parse_args()
    random.seed(1234)

    wiki_files = list_wikipedia_files(args.input_dir)
    if args.shuffle:
        random.shuffle(wiki_files)

    foundation_cache = FoundationCache()
    tag_pipe = selftrain.build_tag_pipe(ssplit=True, lang=args.lang, foundation_cache=foundation_cache)
    parser_pipes = selftrain.build_parser_pipes(args.lang, args.models, foundation_cache=foundation_cache)

    # start with an empty output file, then append after each wiki file
    # so that partial results are usable
    with open(args.output_file, "w"):
        pass

    accepted_trees = set()
    for filename in tqdm(wiki_files):
        docs = read_wiki_file(filename)
        new_trees = selftrain.find_matching_trees(docs, args.num_sentences, accepted_trees, tag_pipe, parser_pipes, shuffle=args.shuffle)
        accepted_trees.update(new_trees)

        with open(args.output_file, "a") as fout:
            for tree in sorted(new_trees):
                fout.write(tree)
                fout.write("\n")

if __name__ == '__main__':
    main()