"""Use a Stanza tokenizer to turn a text file into one tokenized paragraph per line

For example, the output of this script is suitable for Glove

Currently this *only* supports tokenization, no MWT splitting.
It also would be beneficial to have an option to convert spaces into
NBSP, underscore, or some other marker to make it easier to process
languages such as VI which have spaces in them
"""

import argparse
import io
import os
import re
import time
import zipfile

import torch

import stanza
from stanza.models.common.utils import open_read_text, default_device
from stanza.models.tokenization.data import TokenizationDataset
from stanza.models.tokenization.utils import output_predictions
from stanza.pipeline.tokenize_processor import TokenizeProcessor
from stanza.utils.get_tqdm import get_tqdm

tqdm = get_tqdm()

# paragraphs in the raw text are separated by blank lines
NEWLINE_SPLIT_RE = re.compile(r"\n\s*\n")

def tokenize_to_file(tokenizer, fin, fout, chunk_size=1000):
    # chunk_size bounds how many paragraphs are held in memory at once (1000 is an assumed default)
    raw_text = fin.read()
    documents = NEWLINE_SPLIT_RE.split(raw_text)
    for chunk_start in tqdm(range(0, len(documents), chunk_size), leave=False):
        chunk_end = min(chunk_start + chunk_size, len(documents))
        chunk = documents[chunk_start:chunk_end]
        in_docs = [stanza.Document([], text=d) for d in chunk]
        out_docs = tokenizer.bulk_process(in_docs)
        for document in out_docs:
            # write each paragraph as one line, sentences separated by a single space
            for sent_idx, sentence in enumerate(document.sentences):
                if sent_idx > 0:
                    fout.write(" ")
                fout.write(" ".join(x.text for x in sentence.tokens))
            fout.write("\n")

def main(args=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--lang', type=str, default="sd", help="Which language to use for tokenization")
    parser.add_argument('--tokenize_model_path', type=str, default=None, help="Specific tokenizer model to use")
    parser.add_argument('input_files', type=str, nargs='+', help="Which input files to tokenize")
    parser.add_argument('--output_file', type=str, default="glove.txt", help="Where to write the tokenized output")
    parser.add_argument('--model_dir', type=str, default=None, help="Where to get models for a Pipeline (None => default models dir)")
    parser.add_argument('--chunk_size', type=int, default=1000, help="How many 'documents' to use in a chunk when tokenizing.  This is separate from the tokenizer batching - this limits how much memory gets used at once, since we don't need to store an entire file in memory at once")
    args = parser.parse_args(args=args)

    if os.path.exists(args.output_file):
        print("Cowardly refusing to overwrite existing output file %s" % args.output_file)
        return

    if args.tokenize_model_path:
        config = {"model_path": args.tokenize_model_path,
                  "check_requirements": False}
        pipe = TokenizeProcessor(config, pipeline=None, device=default_device())
    else:
        pipe = stanza.Pipeline(lang=args.lang, processors="tokenize", model_dir=args.model_dir)
        pipe = pipe.processors["tokenize"]

    with open(args.output_file, "w", encoding="utf-8") as fout:
        for filename in tqdm(args.input_files):
            if filename.endswith(".zip"):
                with zipfile.ZipFile(filename) as zin:
                    input_names = zin.namelist()
                    for input_name in tqdm(input_names, leave=False):
                        with zin.open(input_name) as fin:
                            fin = io.TextIOWrapper(fin, encoding="utf-8")
                            tokenize_to_file(pipe, fin, fout, args.chunk_size)
            else:
                with open_read_text(filename) as fin:
                    tokenize_to_file(pipe, fin, fout, args.chunk_size)

if __name__ == '__main__':
    main()
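
# Example invocation (a hedged sketch, not part of the original script: the
# module path follows this file's location in the stanza package, "vi" and the
# file names below are placeholders, and input_files may mix plain text files
# and .zip archives of text files):
#
#   python -m stanza.models.tokenization.tokenize_files \
#       --lang vi --output_file tokenized.txt corpus1.txt corpus2.zip
#
# Each blank-line-separated paragraph of the inputs becomes one line of
# space-separated tokens in tokenized.txt, which is suitable as Glove input.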