"""
Processors related to SudachiPy in the pipeline.

GitHub Home: https://github.com/WorksApplications/SudachiPy
"""

from stanza.models.common import doc
from stanza.pipeline._constants import TOKENIZE
from stanza.pipeline.processor import ProcessorVariant, register_processor_variant


def check_sudachipy():
    """
    Import necessary components from SudachiPy to perform tokenization.
    """
    try:
        import sudachipy
        import sudachidict_core
    except ImportError:
        raise ImportError(
            "Both sudachipy and sudachidict_core libraries are required. "
            "Try installing them with `pip install sudachipy sudachidict_core`. "
            "Go to https://github.com/WorksApplications/SudachiPy for more information."
        )
    return True


@register_processor_variant(TOKENIZE, 'sudachipy')
class SudachiPyTokenizer(ProcessorVariant):
    def __init__(self, config):
        """ Construct a SudachiPy-based tokenizer.

        Note that this tokenizer segments sentences by splitting on
        sentence-final punctuation tokens rather than with a statistical model.
        """
        if config['lang'] != 'ja':
            raise Exception("SudachiPy tokenizer is only allowed in Japanese pipelines.")

        check_sudachipy()
        from sudachipy import dictionary

        self.tokenizer = dictionary.Dictionary().create()
        self.no_ssplit = config.get('no_ssplit', False)

    def process(self, document):
        """ Tokenize a document with the SudachiPy tokenizer and wrap the results into a Doc object.
        """
        if isinstance(document, doc.Document):
            text = document.text
        else:
            text = document
        if not isinstance(text, str):
            raise Exception("Must supply a string or Stanza Document object to the SudachiPy tokenizer.")

        tokens = self.tokenizer.tokenize(text)

        sentences = []
        current_sentence = []
        for token in tokens:
            token_text = token.surface()
            # SudachiPy emits whitespace as tokens; skip them to stay
            # consistent with the other tokenizers
            if token_text.isspace():
                continue
            start = token.begin()
            end = token.end()

            token_entry = {
                doc.TEXT: token_text,
                doc.MISC: f"{doc.START_CHAR}={start}|{doc.END_CHAR}={end}",
            }
            current_sentence.append(token_entry)

            # close the sentence on Japanese or ASCII sentence-final punctuation
            if not self.no_ssplit and token_text in ('。', '！', '？', '!', '?'):
                sentences.append(current_sentence)
                current_sentence = []

        if len(current_sentence) > 0:
            sentences.append(current_sentence)

        return doc.Document(sentences, text)
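

# A minimal usage sketch (an editorial addition, not part of the original
# module): processor variants registered with register_processor_variant are
# selected by passing a dict to the Pipeline's `processors` argument, so the
# SudachiPy tokenizer can replace the default Japanese tokenizer as shown
# below. Assumes stanza, sudachipy, and sudachidict_core are installed and
# the Japanese models have been fetched with `stanza.download('ja')`.
if __name__ == '__main__':
    import stanza

    nlp = stanza.Pipeline('ja', processors={'tokenize': 'sudachipy'})
    result = nlp('これはテストです。日本語の文を二つ入れます。')
    for sentence in result.sentences:
        # print the surface form of each token, one sentence per line
        print([token.text for token in sentence.tokens])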