o
    h#                     @   sx   d Z ddlZddlZddlZddlmZ ddlZddlZddlZddl	m	Z	 dgZ
dd Zdd	 Zed
kr:e  dS dS )a\  
Create Stanza character LM train/dev/test data, by reading from txt files in each source corpus directory,
shuffling, splitting and saving into multiple smaller files (50MB by default) in a target directory.

This script assumes the following source directory structures:
    - {src_dir}/{language}/{corpus}/*.txt
It will read from all source .txt files and create the following target directory structures:
    - {tgt_dir}/{language}/{corpus}
and within each target directory, it will create the following files:
    - train/*.txt
    - dev.txt
    - test.txt
Args:
    - src_root: root directory of the source.
    - tgt_root: root directory of the target.
    - langs: a list of language codes to process; if specified, languages not in this list will be ignored.
Note: edit the {EXCLUDED_FOLDERS} variable to exclude more folders in the source directory.
    N)Path)tqdm
raw_corpusc            
   
      s  t  } | jdddd | jdddd | jdd	d
d | jdd	dd | jdddddd | jddtdd | jdddddd |  }td td|j  td|j  td	 g t|j	dkrt|j	
dtdt  g t|jdkr|j
dtd t  t|jt|j}t}d!d" |D }fd#d"|D }tdkrfd$d"|D }tt| d% t| td	 t|jd& d& }|D ]m}|  t }tdkrfd'd"|D } fd(d"|D }tt| d)| d* t| td	 |D ]/} | }|| | }	tj|	s&t|	 td+| d,|  t||	|||j||j qtd	 qd S )-Nsrc_rootsrczRoot directory with all source files.  Expected structure is root dir -> language dirs -> package dirs -> text files to process)defaulthelptgt_roottgtz%Root directory with all target files.z--langs zaA list of language codes to process.  If not set, all languages under src_root will be processed.z
--packageszeA list of packages to process.  If not set, all packages under the languages found will be processed.z--no_xz_outputT	xz_outputstore_falsezOutput compressed xz files)r   destactionr   z--split_size2   z#How large to make each split, in MB)r   typer   z--no_make_test_filemake_test_filezoDon't save a test file.  Honestly, we never even use it.  Best for low resource languages where every bit helpszProcessing files:zsource root: ztarget root: r   ,z)Only processing the following languages: z(Only processing the following packages: c                 S   s   g | ]}|t vr|qS  )EXCLUDED_FOLDERS.0lr   r   [/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/utils/charlm/make_lm_data.py
<listcomp>>       zmain.<locals>.<listcomp>c                        g | ]}t j | r|qS r   ospathisdirr   )r   r   r   r   ?        c                       g | ]}| v r|qS r   r   r   )langsr   r   r   A   r   z total languages found:   c                    r"   r   r   r   d)packagesr   r   r   L   r   c                    r   r   r   r%   )	lang_rootr   r   r   M   r!   z! total corpus found for language .z-> Processing -)argparseArgumentParseradd_argumentint
parse_argsprintr   r	   lenr#   splitstrr'   r   r   listdir
split_sizer   existsmakedirsprepare_lm_datar   r   )
parserargsr	   	lang_dirsr5   lang	data_dirsdataset_namesrc_dirtgt_dirr   )r(   r#   r'   r   r   main    sd   





rA   c                 C   s  t | tsJ t |tsJ tj|d}tj|| d| d}td| d tt	| d tt	| d  tt	| d  }	t
|	D ]F}
|
d	rad
|
 d| }tj|dd qJ|
drvd|
 d| }tj|dd qJ|
drd|
 d| }tj|dd qJtd|
 tj|| d| d}td| d d
| d| }tj|dd}|jdkrtdtj|d d d }td|dd |dk rtdtd| d |d  }tj|st| d!| d"| d#| d$| d| d}tj|dd}|jdkrtd%tt| d}td&| d' |d(k r2td)| | d*}| d+}|rntd, t| d$| d| d-| t| d$| d| d.| ||gt| d }ntd/ t| d$| d| d-| |gt| d }|rtd0 t
|D ]
}td1|g qtd2 W d3   n	1 sw   Y  td4| d| d5 d3S )6z\
    Combine, shuffle and split data into smaller files, following a naming convention.
    )dirr*   z.tmpz--> Copying files into z...z/*.txtz	/*.txt.xzz	/*.txt.gzz.txtzcat z >> T)shellz.txt.xzzxzcat z.txt.gzzzcat zshould not have found %sz.tmp.shuffledz--> Shuffling files into z
 | shuf > r   zFailed to shuffle files!r$   z--> Shuffled file size: z.4fz GBg?zFNot enough data found to build a charlm.  At least 100MB data expectedz)--> Splitting into smaller files of size z ...trainz	split -C z" -a 4 -d --additional-suffix .txt  /zFailed to split files!z--> z total files generated.   zTSomething went wrong!  %d file(s) produced by shuffle and split, expected at least 3z/dev.txtz	/test.txtz"--> Creating dev and test files...z	-0000.txtz	-0001.txtz--> Creating dev file...z--> Compressing files...xzz--> Cleaning up...Nz--> All done for z.
)
isinstancer   tempfileTemporaryDirectoryr   r   joinr0   globr3   r   endswith
subprocessrunAssertionError
returncodeRuntimeErrorgetsizer6   r7   r1   shutilmove)r?   r@   r<   r>   compressr5   r   tempdirtgt_tmpinput_filessrc_fncmdtgt_tmp_shuffledresultsize	train_dirtotaldev_file	test_file	txt_filestxt_filer   r   r   r8   ]   sp   6




$



<r8   __main__)__doc__r+   rM   r   pathlibr   rU   rO   rJ   r   r   rA   r8   __name__r   r   r   r   <module>   s    =D
