"""Builds a self-training dataset from an Italian data source and two models

The idea is that the top down and the inorder parsers should make
somewhat different errors, so hopefully the sum of an 86 f1 parser and
an 85.5 f1 parser will produce some half-decent silver trees which can
be used as self-training so that a new model can do better than either.

One dataset used is PaCCSS, which has 63000 pairs of sentences:

http://www.italianlp.it/resources/paccss-it-parallel-corpus-of-complex-simple-sentences-for-italian/

PaCCSS-IT: A Parallel Corpus of Complex-Simple Sentences for Automatic Text Simplification
  Brunato, Dominique et al., 2016
  https://aclanthology.org/D16-1034

Even larger is the IT section of Europarl, which has 1900000 lines

https://www.statmt.org/europarl/

Europarl: A Parallel Corpus for Statistical Machine Translation
  Philipp Koehn
  https://homepages.inf.ed.ac.uk/pkoehn/publications/europarl-mtsummit05.pdf
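
A rough sketch of a typical invocation (the exact flags come from
parse_args below and from selftrain.common_args, so treat the paths and
options shown here as illustrative defaults rather than a definitive recipe):

  python3 -m stanza.utils.datasets.constituency.selftrain_it \
      --input_dir extern_data/italian \
      --output_file data/constituency/it_silver.mrg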
    N)FoundationCache)	selftrain)get_tqdmstanzac                  C   sv   t jdd} t|  | jdddd | jddd	d
dd | jdd | jdd | jdd | jdd |  }|S )NzAScript that converts a public IT dataset to silver standard trees)descriptionz--input_dirzextern_data/italianz-Path to the PaCCSS corpus and europarl corpus)defaulthelpz--no_europarlTstore_falseeuroparlzIUse the europarl dataset.  Turning this off makes the script a lot faster)r   actiondestr   it)langvit)packagezlsaved_models/constituency/it_best/it_vit_inorder_best.pt,saved_models/constituency/it_best/it_vit_topdown.pt)modelszdata/constituency/it_silver.mrg)output_file)argparseArgumentParserr   common_argsadd_argumentset_defaults
parse_args)parserargs r   j/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/utils/datasets/constituency/selftrain_it.pyr   &   s,   
r   c                 C   s   t j| d}t|}| dd }W d   n1 sw   Y  dd |D }dd |D }dd |D }tdt|| |S )	zB
    Read the paccss dataset, which is two sentences per line
    zPaCCSS/data-set/PACCSS-IT.txt   Nc                 S      g | ]}|  qS r   strip.0xr   r   r   
<listcomp>I       zget_paccss.<locals>.<listcomp>c                 S   s"   g | ]}|r| d dd qS )	N   )splitr!   r   r   r   r$   J   s   " c                 S   s   g | ]	}|D ]}|qqS r   r   )r"   r#   yr   r   r   r$   K   s    zRead %d sentences from %s)ospathjoinopen	readlinesloggerinfolen)	input_dir
input_filefinlinestextr   r   r   
get_paccssA   s   
r7   c                 C   s   t j| d}t|}| dd }W d   n1 sw   Y  dd |D }dd |D }tdt|| t	||}|S )z`
    Read the Europarl dataset

    This dataset needs to be tokenized and split into lines
    zeuroparl/europarl-v7.it-en.itr   Nc                 S   r   r   r   r!   r   r   r   r$   Y   r%   z get_europarl.<locals>.<listcomp>c                 S   s   g | ]}|r|qS r   r   r!   r   r   r   r$   Z   r%   zRead %d docs from %s)
r*   r+   r,   r-   r.   r/   r0   r1   r   
split_docs)r2   ssplit_piper3   r4   r5   r   r   r   get_europarlO   s   
r:   c            	   
   C   s   t  } t }tjd| jd}tjd| j|d}tj| j| j| j|d}t	| j
}| jr4|t| j
| tdt| tj|| jt ||dd| jd}td	t|  t| jd
}t|D ]}|| |d q`W d   dS 1 sxw   Y  dS )zI
    Combine the two datasets, parse them, and write out the results
    T)ssplitr   F)r;   r   foundation_cache)r   r<   zProcessing %d docsd   )shuffle
chunk_size
output_ptbz7Found %d unique trees which are the same between modelsw
N)r   r   r   build_ssplit_piper   build_tag_pipebuild_parser_pipesr   r   r7   r2   r
   extendr:   r/   r0   r1   find_matching_treesnum_sentencessetr@   r-   r   sortedwrite)	r   r<   r9   tag_pipeparser_pipesdocs	new_treesfouttreer   r   r   main_   s"   
 
"rR   __main__)__doc__r   loggingr*   randomr   %stanza.models.common.foundation_cacher   "stanza.utils.datasets.constituencyr   stanza.utils.get_tqdmr   tqdm	getLoggerr/   r   r7   r:   rR   __name__r   r   r   r   <module>   s$    

