o
    h:	                     @   sf   d Z ddlZddlZddlZddlmZ ddlmZ ddlm	Z	 dd Z
dd	 Zed
kr1e  dS dS )z
Split a constituency dataset randomly into 90/10 splits

TODO: add a function to rotate the pieces of the split so that each
training instance gets seen once
    N)tree_reader)copy_dev_test)get_default_pathsc                 C   s^   t j| d| }t|ddd}|D ]	}|d|  qW d    d S 1 s(w   Y  d S )N%s_train.mrgwzutf-8)encodingz%s
)ospathjoinopenwrite)	base_pathdataset_nametreesoutput_pathfouttree r   k/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/utils/datasets/constituency/split_holdout.pywrite_trees   s   "r   c                  C   sl  t jdd} | jdtddd | jdtd dd | jd	td d
d | jdtddd | jdtddd |  }|jd u rI|jd |_t	d|j  |j
d u r[|jd |_
t	d|j
  t d }t||j|j t||j|j
 tj|d|j }t	d|  t|}g }g }t|j |D ]}t |jk r|| q|| qt||j| t||j
| d S )Nz[Split a standard dataset into 90/10 proportions of train so there is held out training data)descriptionz	--datasetid_iconzdataset to split)typedefaulthelpz--base_datasetzoutput name for base datasetz--holdout_datasetzoutput name for holdout datasetz--ratiog?zNumber of trees to hold outz--seedi  zRandom seedz-basez --base_dataset not set, using %sz-holdoutz#--holdout_dataset not set, using %sCONSTITUENCY_DATA_DIRr   z
Reading %s)argparseArgumentParseradd_argumentstrfloatint
parse_argsbase_datasetdatasetprintholdout_datasetr   r   r   r	   r
   r   read_tree_filerandomseedratioappendr   )parserargsr   
train_filer   
base_trainholdout_trainr   r   r   r   main   s8   



r1   __main__)__doc__r   r   r(   stanza.models.constituencyr   (stanza.utils.datasets.constituency.utilsr   stanza.utils.default_pathsr   r   r1   __name__r   r   r   r   <module>   s    '
