o
    h'                     @   sz   d dl Z d dlZd dlZd dlZd dlZd dlmZ 	 edZdd Z	dd Z
dd	 Zed
kr;eejdd  dS dS )    N)Counterz\n\s*\nc                 C   s6   ||  dkrt || }|rt|d}d|fS dS )zb Detect if a paragraph break can be found, and return the length of the paragraph break sequence. 
r   T)Fr   )PARAGRAPH_BREAKmatchlengroup)indextext
para_break	break_len r   g/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/utils/datasets/prepare_tokenizer_data.pyis_para_break   s   r   c                 C   s  d}d}| t |k r|t |k rt| |\}}|r:t |dkr.td|s,J d|d}|d | |d 7 } n5td||  rQtd|| sQ|||  7 }n|||  7 }||  dd	|| kskJ d
||f |d7 }| d7 } | t |k r|t |k s| |fS )zu
    Locate the next word in the text. In case a paragraph break is found, also write paragraph break to labels.
    r    z^\s+$zSFound non-empty string at the end of a paragraph that doesn't match any token: |{}|z

   z^\s$r    zECharacter mismatch: raw text contains |%s| but the next word is |%s|.)r   r   rer   formatwritereplace)r   r	   wordoutputidx
word_sofarr
   r   r   r   r   find_next_word   s$   
 (r   c              
   C   s  t  }|jdtdd |jdtdd |jddd tdd	 |jd
dd tdd	 |j| d} t| jddd}d| }W d    n1 sHw   Y  t	|}| j
d u rZtj}ntj| j
d }tj|dd t| j
d}d}g }t| jddd}d}	d}
d}g }d}|D ]}| }t	|rF|d dkrt	|dkr|}q|d}d|d v rq|d }d|d v rdd |d dD \}
}|}g }nS|
t|d   kr|k rn n||g7 }qt|d |kr||g7 }dd |D }||t|fg7 }|d  r|d d  std|tjd  d}
d}d }qt	|	r)||	 t||||\}}d!t	|d  d|d vrBd"nd# }	qt	|	rlt|	d dksVJ ||	d d d$t|	d d   d}	d}qW d    n	1 szw   Y  d}| j
r|  d%| j
 }t|}| jd u rtd&| d S t| jd}tjt |! |d'd( W d    n	1 sw   Y  |d)t	|| j }t| d S )*Nplaintext_filez'Plaintext file containing the raw input)typehelpconllu_filez2CoNLL-U file containing tokens and sentence breaksz-oz--outputzFOutput file name; output to the console if not specified (the default))defaultr   r   z-mz--mwt_outputzYOutput file name for MWT expansions; output to the console if not specified (the default))argsrzutf-8)encodingr   r   T)exist_okw#	.r   -c                 S   s   g | ]}t |qS r   )int.0xr   r   r   
<listcomp>i       zmain.<locals>.<listcomp>c                 S   s   g | ]}|  qS r   )lowerr+   r   r   r   r.   q   r/   z0Sentence ID with potential wrong MWT expansion: )file013z{}z!Tokenizer labels written to %s
  zMWTs:   )indentz1{} unique MWTs found in data.  MWTs written to {})"argparseArgumentParseradd_argumentstr
parse_argsopenr   join	readlinesr   r   sysstdoutospathsplitmakedirsr   stripr*   tupleislowerprintstderrr   r   r   closer   
mwt_outputjsondumplistitems)r    parserfr	   textlenr   outdirr   mwt_expansionsbufmwtbeginmwtendexpandedlast_commentsliner   lastmwt
word_foundstatus_linemwtsr   r   r   main:   s   


 

 

(
(5
r_   __main__r   )r7   rL   rA   r   r?   collectionsr   compiler   r   r   r_   __name__argvr   r   r   r   <module>   s    
	\