"""
Processor for performing tokenization
"""

import io
import logging

import torch

from stanza.models.tokenization.data import TokenizationDataset
from stanza.models.tokenization.trainer import Trainer
from stanza.models.tokenization.utils import output_predictions
from stanza.pipeline._constants import *
from stanza.pipeline.processor import UDProcessor, register_processor
from stanza.pipeline.registry import PROCESSOR_VARIANTS
from stanza.models.common import doc

# imported so that the external tokenizer variants register themselves
from stanza.pipeline.external.jieba import JiebaTokenizer
from stanza.pipeline.external.spacy import SpacyTokenizer
from stanza.pipeline.external.sudachipy import SudachiPyTokenizer
from stanza.pipeline.external.pythainlp import PyThaiNLPTokenizer

logger = logging.getLogger('stanza')

# replacement text for tokens which exceed the tokenizer's maximum sequence length
TOKEN_TOO_LONG_REPLACEMENT = "<UNK>"

@register_processor(name=TOKENIZE)
class TokenizeProcessor(UDProcessor):

    # set of processor requirements this processor fulfills
    PROVIDES_DEFAULT = set([TOKENIZE])
    # set of processor requirements for this processor
    REQUIRES_DEFAULT = set([])
    # default max sequence length
    MAX_SEQ_LENGTH_DEFAULT = 1000

    def _set_up_model(self, config, pipeline, device):
        # no model is loaded for pretokenized input; otherwise load the neural tokenizer
        if config.get('pretokenized'):
            self._trainer = None
        else:
            self._trainer = Trainer(model_file=config['model_path'], device=device)

        # optional user-supplied callable used to adjust the predicted tokenization
        postprocessor = config.get('postprocessor')
        if postprocessor and callable(postprocessor):
            self._postprocessor = postprocessor
        elif not postprocessor:
            self._postprocessor = None
        else:
            raise ValueError("Tokenizer received 'postprocessor' option of unrecognized type; postprocessor must be callable. Got %s" % postprocessor)
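    # Sketch of how the 'postprocessor' option read above might be supplied.  The
    # processor only requires config['postprocessor'] to be callable; the pipeline
    # keyword shown here ('tokenize_postprocessor') and the callable's exact
    # input/output format are assumptions, not something this module defines:
    #
    #     def my_fixup(tokenization):
    #         ...  # adjust and return the proposed tokenization
    #
    #     nlp = stanza.Pipeline('en', processors='tokenize',
    #                           tokenize_postprocessor=my_fixup)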
zTokenizeProcessor._set_up_modelc           
      C   s   g }t |trdd | dD }nt |tr|}d}|D ]5}g }t|D ]'\}}|tj|d ftj	|tj
d| d|t|  i |t|d 7 }q'|| qdd	d |D }	|	|fS )
z
        Pretokenized text can be provided in 2 manners:

        1.) str, tokenized by whitespace, sentence split by newline
        2.) list of token lists, each token list represents a sentence

        generate dictionary data structure
        c                 S   s(   g | ]}t | d kr|  qS )r   )lenstripsplit.0sentr   r   r   
<listcomp>C   s   ( z@TokenizeProcessor.process_pre_tokenized_text.<locals>.<listcomp>
r      zstart_char=z
|end_char= c                 S   s   g | ]}d  |qS )r*   )join)r%   sentencer   r   r   r'   M   s    )
isinstancestrr"   r#   list	enumerateappendr	   IDTEXTMISCr!   r+   )
r   	input_srcdocument	sentencesidxr,   r&   token_idtokenraw_textr   r   r   process_pre_tokenized_text7   s   


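    # Example of the two pretokenized input formats accepted above and the structure
    # they produce:
    #
    #     "This is a test .\nAnother sentence ."      # str: whitespace tokens, newline sentences
    #     [["This", "is", "a", "test", "."],
    #      ["Another", "sentence", "."]]              # list of token lists
    #
    # Either form yields one dict per token, e.g. for the first token:
    #     {doc.ID: (1,), doc.TEXT: "This", doc.MISC: "start_char=0|end_char=4"}
    # and raw_text is simply the tokens rejoined with single spaces.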
    def process(self, document):
        if not isinstance(document, str) and not isinstance(document, doc.Document) and \
                not self.config.get('pretokenized') and not self.config.get('no_ssplit', False):
            raise ValueError("If neither 'pretokenized' nor 'no_ssplit' option is enabled, the input to the TokenizerProcessor must be a string or a Document object.  Got %s" % str(type(document)))

        if isinstance(document, doc.Document):
            if self.config.get('pretokenized'):
                return document
            document = document.text

        if self.config.get('pretokenized'):
            raw_text, document = self.process_pre_tokenized_text(document)
            return doc.Document(document, raw_text)

        # external tokenizer variants (jieba, spacy, sudachipy, pythainlp) handle the text themselves
        if hasattr(self, '_variant'):
            return self._variant.process(document)

        raw_text = '\n\n'.join(document) if isinstance(document, list) else document

        # set up batches for the neural tokenizer
        max_seq_len = self.config.get('max_seqlen', TokenizeProcessor.MAX_SEQ_LENGTH_DEFAULT)
        batches = TokenizationDataset(self.config, input_text=raw_text, vocab=self.vocab,
                                      evaluation=True, dictionary=self.trainer.dictionary)

        # run the tokenizer and build the document dictionary
        with torch.no_grad():
            _, _, _, document = output_predictions(None, self.trainer, batches, self.vocab, None, max_seq_len,
                                                   orig_text=raw_text,
                                                   no_ssplit=self.config.get('no_ssplit', False),
                                                   num_workers=self.config.get('num_workers', 0),
                                                   postprocessor=self._postprocessor)

        # tokens which are too long for downstream processors are replaced with <UNK>
        for sentence in document:
            for token in sentence:
                if len(token['text']) > max_seq_len:
                    token['text'] = TOKEN_TOO_LONG_REPLACEMENT

        return doc.Document(document, raw_text)

    def bulk_process(self, docs):
        """
        The tokenizer cannot use UDProcessor's sentence-level cross-document batching interface, and requires special handling.
        Essentially, this method concatenates the text of multiple documents with "\n\n", tokenizes it with the neural tokenizer,
        then splits the result into the original Documents and recovers the original character offsets.
        """
        if hasattr(self, '_variant'):
            return self._variant.bulk_process(docs)

        if self.config.get('pretokenized'):
            res = []
            for document in docs:
                raw_text, document = self.process_pre_tokenized_text(document.text)
                res.append(doc.Document(document, raw_text))
            return res

        combined_text = '\n\n'.join([thisdoc.text for thisdoc in docs])
        processed_combined = self.process(doc.Document([], text=combined_text))

        # postprocess sentences and tokens to reset back pointers and character offsets
        charoffset = 0
        sentst = senten = 0
        for thisdoc in docs:
            # find the span of sentences belonging to this document
            while senten < len(processed_combined.sentences) and \
                    processed_combined.sentences[senten].tokens[-1].end_char - charoffset <= len(thisdoc.text):
                senten += 1

            sentences = processed_combined.sentences[sentst:senten]
            thisdoc.sentences = sentences
            for sent in sentences:
                # fix the document back pointers for the sentences
                sent._doc = thisdoc
                # fix the character offsets for the tokens (and their words, if any)
                for token in sent.tokens:
                    token._start_char -= charoffset
                    token._end_char -= charoffset
                    if token.words:
                        for word in token.words:
                            word._start_char -= charoffset
                            word._end_char -= charoffset

            if len(sentences) > 0:
                # restore the leading and trailing whitespace of this document
                last_token = sentences[-1].tokens[-1]
                last_whitespace = thisdoc.text[last_token.end_char:]
                last_token.spaces_after = last_whitespace
                first_token = sentences[0].tokens[0]
                first_whitespace = thisdoc.text[:first_token.start_char]
                first_token.spaces_before = first_whitespace

            thisdoc.num_tokens = sum(len(sent.tokens) for sent in sentences)
            thisdoc.num_words = sum(len(sent.words) for sent in sentences)

            sentst = senten
            # skip past this document's text and the "\n\n" separator
            charoffset += len(thisdoc.text) + 2

        return docs
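
# A hedged usage sketch for this processor.  The pipeline options shown
# ('tokenize_pretokenized') correspond to the 'pretokenized' config key read above;
# the example assumes the English models have already been downloaded
# (e.g. via stanza.download('en')).
if __name__ == '__main__':
    # imported here rather than at module level to avoid a circular import
    import stanza

    # plain text: the neural tokenizer predicts both token and sentence boundaries
    nlp = stanza.Pipeline('en', processors='tokenize')
    print(nlp('This is a test.  This is another sentence.'))

    # pretokenized input: process_pre_tokenized_text builds the document directly,
    # so no tokenizer model is run at all
    nlp_pre = stanza.Pipeline('en', processors='tokenize', tokenize_pretokenized=True)
    print(nlp_pre([['This', 'is', 'a', 'test', '.']]))

    # bulk processing: a list of Documents is concatenated with "\n\n" and split back
    # apart by bulk_process above, preserving per-document character offsets
    docs = [stanza.Document([], text=t) for t in ['First document.', 'Second document.']]
    print(nlp(docs))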