o
    h                     @   sR   d Z ddlmZ ddlmZ ddlmZmZ dd ZeedG dd	 d	eZ	d
S )zg
Processors related to PyThaiNLP in the pipeline.

GitHub Home: https://github.com/PyThaiNLP/pythainlp
    )doc)TOKENIZE)ProcessorVariantregister_processor_variantc                  C   s&   zddl } W dS  ty   tdw )zM
    Import necessary components from pythainlp to perform tokenization.
    r   NzThe pythainlp library is required. Try to install it with `pip install pythainlp`. Go to https://github.com/PyThaiNLP/pythainlp for more information.T)	pythainlpImportError)r    r   ]/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/pipeline/external/pythainlp.pycheck_pythainlp   s   
r
   r   c                   @   s   e Zd Zdd Zdd ZdS )PyThaiNLPTokenizerc                 C   sP   |d dkr
t dt  ddlm} ddlm} || _|| _|dd| _d	S )
a   Construct a PyThaiNLP-based tokenizer.

        Note that we always uses the default tokenizer of PyThaiNLP for sentence and word segmentation.
        Currently this is a CRF model for sentence segmentation and a dictionary-based model (newmm) for word segmentation.
        langthz5PyThaiNLP tokenizer is only allowed in Thai pipeline.r   )sent_tokenize)word_tokenize	no_ssplitFN)		Exceptionr
   pythainlp.tokenizer   r   pythai_sent_tokenizepythai_word_tokenizegetr   )selfconfigr   r   r   r   r	   __init__   s   zPyThaiNLPTokenizer.__init__c           
      C   s   t |tjr
|j}n|}t |tstdg }g }d}| jr"|g}n| j|dd}|D ]B}| j|ddD ]1}|	 rA|t
|7 }q4tj|tjtj d| dtj d|t
|  i}	||	 |t
|7 }q4|| g }q+t
|dkry|| t||S )zb Tokenize a document with the PyThaiNLP tokenizer and wrap the results into a Doc object.
        zJMust supply a string or Stanza Document object to the PyThaiNLP tokenizer.r   crfcut)enginenewmm=|)
isinstancer   Documenttextstrr   r   r   r   isspacelenTEXTMISC
START_CHAREND_CHARappend)
r   documentr    	sentencescurrent_sentenceoffset	sent_strssent_str	token_strtoken_entryr   r   r	   process,   s4   
(


zPyThaiNLPTokenizer.processN)__name__
__module____qualname__r   r1   r   r   r   r	   r      s    r   N)
__doc__stanza.models.commonr   stanza.pipeline._constantsr   stanza.pipeline.processorr   r   r
   r   r   r   r   r	   <module>   s    