o
    h                     @   s\   d Z ddlZddlZddlZdd Zdd Zdd ZdddZdd Ze	dkr,e  dS dS )a  
From a directory of files with VTB Trees, split into train/dev/test set
with a split of 70/15/15

The script requires two arguments
1. org_dir: the original directory obtainable from running vtb_convert.py
2. split_dir: the directory where the train/dev/test splits will be stored
    Nc                 C   s   t t| }t| |S )z
    This function creates the random order with which we use to loop through the files

    :param org_dir: original directory storing the files that store the trees
    :return: list of file names randomly shuffled
    )sortedoslistdirrandomshuffle)org_dir
file_names r	   g/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/utils/datasets/constituency/vtb_split.pycreate_shuffle_list   s   
r   c                 C   s\   |sd}n	| ds|d }tj| d| }tj| d| }tj| d| }|||fS )z
    This function creates the necessary paths for the train/dev/test splits

    :param split_dir: directory that stores the splits
    :return: train path, dev path, test path
     _z%strain.mrgz	%sdev.mrgz
%stest.mrg)endswithr   pathjoin)	split_dir
short_name
train_pathdev_path	test_pathr	   r	   r
   create_paths   s   

r   c              	   C   st   d}|D ]3}| dsqtj| |}t|ddd}| }|D ]}|d7 }q!W d   n1 s2w   Y  q|S )z
    Function for obtaining the number of samples

    :param org_dir: original directory storing the tree files
    :param file_names: list of file names in the directory
    :return: number of samples
    r   .mrgrutf-8encoding   N)r   r   r   r   open	readlines)r   r   countfilenamefile_dirreadercontentliner	   r	   r
   get_num_samples/   s   

r%   ffffff?333333?c              	   C   sz  t j|dd || dkrtd|| t| }t||\}}}	t| |}
td|
|  t|
| }|| dkrO|
}||f}||f}td|||  n5|| dkrut|
||  }|||
f}|||	f}td||| |
|  nd	}|
f}|	f}td
|
 d	}g }|D ]:}|dsqt	t j
| |dd}| }dd |D }dd |D }|| W d    n1 sw   Y  q|d ur|d	 d	krt||d	  |d  }|}||| |d |  ||d   }t|}t||D ]B\}}t	|ddd.}||k r*t|d }|d u rtd|| |d |d7 }||k s	W d    n	1 s5w   Y  qd S )NT)exist_okg      ?z>Not making a test slice with the given ratios: train {} dev {}zFound {} total lines in {}zSplitting {} train, {} devg        z#Splitting {} train, {} dev, {} testr   zCopying all {} lines to testr   r   r   c                 S   s   g | ]}|  qS r	   )strip.0xr	   r	   r
   
<listcomp>s       zsplit_files.<locals>.<listcomp>c                 S   s   g | ]}|r|qS r	   r	   r*   r	   r	   r
   r-   t   r.   r   wz9Ran out of trees before reading all of the expected trees
)r   makedirsprintformatr   r   r%   intr   r   r   r   r   extendleniterzipnextRuntimeErrorwrite)r   r   r   
train_sizedev_sizerotationr   r   r   r   num_samples
stop_trainstop_devoutput_limitsoutput_namesr   treesr    r"   	new_treesrotation_startrotation_end	tree_iter
write_pathcount_limitwriter	next_treer	   r	   r
   split_filesF   sj   



$





rM   c                  C   sT   t jdd} | jddd | jddd |  }|j}|j}td t|| d	S )
z
    Main function for the script

    Process args, loop through each tree in each file in the directory
    and write the trees to the train/dev/test split with a split of
    70/15/15
    zHScript that splits a list of files of vtb trees into train/dev/test sets)descriptionr   zMThe location of the original directory storing correctly formatted vtb trees )helpr   z<The location of new directory storing the train/dev/test seti  N)	argparseArgumentParseradd_argument
parse_argsr   r   r   seedrM   )parserargsr   r   r	   r	   r
   main   s    
rW   __main__)Nr&   r'   N)
__doc__r   rP   r   r   r   r%   rM   rW   __name__r	   r	   r	   r
   <module>   s    	
C
