o
    –hA
  ã                   @   s^   d Z ddlZddlZddlZddlmZ ddlmZ ddlm	Z	 dd„ Z
edkr-e
ƒ  dS dS )	zÐ
Randomly split a file into train, dev, and test sections

Specifically used in the case of building a tagger from the initial
POS tagging provided by Isra, but obviously can be used to split any
conllu file
é    N©ÚDocument)ÚCoNLL)Úget_default_pathsc                  C   sÚ  t  ¡ } | jdddd | jdtddd | jd	td
dd | jdtd
dd | jdddd | jdddd | jdddddd | jdddddd | jdtƒ d dd |  ¡ }|j|j|jf}t	 
|j¡}t |j¡ g g f}g g f}g g f}|||g}|jD ]9}| ¡ }	|jr–|	D ]}
|
 d d ¡ q|jr¤|	D ]}
|
 d!d ¡ q›t ||¡d" }|d"  |	¡ |d#  |j¡ q‚d$d%„ |D ƒ}t|d&ƒD ]"\}}tj |jd'|j|f ¡}td(t|jƒ|f ƒ t	 ||¡ qÈd S ))Nz
--filenamez*extern_data/sindhi/upos/sindhi_upos.conlluzWhich file to split)ÚdefaultÚhelpz--traingš™™™™™é?z%Fraction of the data to use for train)Útyper   r   z--devgš™™™™™¹?z#Fraction of the data to use for devz--testz$Fraction of the data to use for testz--seedÚ1234zRandom seed to usez--short_nameÚsd_israz-Dataset name to use when writing output filesz--no_remove_xposTÚstore_falseÚremove_xposz/By default, we remove the xpos from the dataset)r   ÚactionÚdestr   z--no_remove_featsÚremove_featsz0By default, we remove the feats from the datasetz--output_directoryÚPOS_DATA_DIRzWhere to put the split conlluÚxposÚfeatsr   é   c                 S   s    g | ]}t |d  |d d‘qS )r   r   )Úcommentsr   )Ú.0Úsplit© r   úd/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/utils/datasets/random_split_conllu.pyÚ
<listcomp>3   s     zmain.<locals>.<listcomp>)ÚtrainÚdevÚtestz%s.%s.in.conlluzOutputting %d sentences to %s)ÚargparseÚArgumentParserÚadd_argumentÚfloatr   Ú
parse_argsr   r   r   r   Ú	conll2docÚfilenameÚrandomÚseedÚ	sentencesÚto_dictr   Úpopr   ÚchoicesÚappendr   ÚzipÚosÚpathÚjoinÚoutput_directoryÚ
short_nameÚprintÚlenÚwrite_doc2conll)ÚparserÚargsÚweightsÚdocÚ	train_docÚdev_docÚtest_docÚsplitsÚsentenceÚsentence_dictÚxr   Ú	split_docÚ
split_namer#   r   r   r   Úmain   sF   

ýrA   Ú__main__)Ú__doc__r   r,   r$   Ústanza.models.common.docr   Ústanza.utils.conllr   Ústanza.utils.default_pathsr   rA   Ú__name__r   r   r   r   Ú<module>   s    (
ÿ