o
    h                     @   s   d Z ddlZddlZddlZddlmZ ddlmZ ddlm	Z	 ddlm
Z
 ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ e ZedZdddZdd Zedkr^e  dS dS )a  
Use the concepts in "Dataset Cartography" and "Mind Your Outliers" to find trees with the least variance over a training run

https://arxiv.org/pdf/2009.10795.pdf
https://arxiv.org/abs/2107.02331

The idea here is that high variance trees are more likely to be wrong in the first place.  Using this will filter a silver dataset to have better trees.

for example:

nlprun -d a6000 -p high "export CLASSPATH=/sailhome/horatio/CoreNLP/classes:/sailhome/horatio/CoreNLP/lib/*:$CLASSPATH; python3 stanza/utils/datasets/constituency/silver_variance.py --eval_file /u/nlp/data/constituency-parser/italian/2024_it_vit_electra/it_silver_0.mrg saved_models/constituency/it_vit.top.each.silver0.constituency_0*0.pt --output_file filtered_silver0.mrg" -o filter.out
    N)utils)FoundationCache)	retagging)tree_reader)run_dev_set)Trainer)retag_trees)EvaluateParser)get_tqdmzstanza.constituency.trainerc                 C   s   t jdd}|jdtd dd |jdtd dd |jdtd d	d |jd
td dd |jdtd dd t| |jdddd |jdtddd |jdtdd dd |jdtddd |jddddd  t	| t
| } t|  | S )!NziScript to filter trees by how much variance they show over multiple checkpoints of a parser training run.)descriptionz--eval_filezInput file for data loader.)typedefaulthelpz--output_filez&Output file after sorting by variance.z--charlm_forward_filez$Exact path to use for forward charlmz--charlm_backward_filez%Exact path to use for backward charlmz--wordvec_pretrain_filez'Exact name of the pretrain file to readz--langitzLanguage to use)r   r   z--eval_batch_size2   z)How many trees to batch when running evalmodels+zWhich model(s) to load)r   nargsr   r   z--keepg      ?z0How many trees to keep after sorting by variancez	--reverseF
store_truez&Actually, keep the high variance trees)r   actionr   )argparseArgumentParseradd_argumentstrr   add_device_argsintfloatr   add_retag_argsvars
parse_argspostprocess_args)argsparser r#   m/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/utils/datasets/constituency/silver_variance.pyr   "   s    


r   c               	   C   s(  t  } t| }|r|d jnt }tdd| d   t| d }t	
dt| g }d }d}t |}| d D ]o}td|  tj|| |d	}	|d urr|d u rr|	jjd
 }
|	jjd }t	
d|
| d  t|||}t	
d g }tdt||D ]'}||||  }|r||||  nd }t|	j||| |\}}}|| q||| q:W d    n1 sw   Y  t|}tj|dd}tdd t|D | d d}tt|| d  }t| d ddd"}|d | D ]\}}|t||  |d qW d    d S 1 sw   Y  d S )Nr   z'Analyzing with the following models:
  z
  r   	eval_filezRead %d trees for analysisi  zStarting processing with %s)r!   foundation_cacheretag_method
retag_xposz8Retagging trees using the %s tags from the %s package...retag_packagezRetagging finished)axisc                 S   s   g | ]\}}||fqS r#   r#   ).0idxxr#   r#   r$   
<listcomp>c   s    zmain.<locals>.<listcomp>reverse)r/   keepoutput_filewzutf-8)encoding
)r   r   build_retag_pipeliner&   r   printjoinr   read_treebankloggerinfolenr	   r   loadmodelr!   r   ranger   extendappendnumpyarrayvarsorted	enumerater   openwriter   )r!   retag_pipeliner&   treebank
f1_historyretagged_treebank
chunk_size	evaluatormodel_filenametrainerr'   r(   current_historychunk_startchunkretagged_chunkf1kbestF1treeF1f1_variance	f1_sortednum_keepfout_r,   r#   r#   r$   main?   sL   


$r\   __main__)N)__doc__r   loggingrA   stanza.models.commonr   %stanza.models.common.foundation_cacher   stanza.models.constituencyr   r   *stanza.models.constituency.parser_trainingr   "stanza.models.constituency.trainerr    stanza.models.constituency.utilsr   stanza.server.parser_evalr	   stanza.utils.get_tqdmr
   tqdm	getLoggerr9   r   r\   __name__r#   r#   r#   r$   <module>   s(    

,
