o
    hJ	                     @   sZ   d Z ddlZddlmZ ddlmZ ddlmZmZ dd Z	eedG d	d
 d
eZ
dS )z.
Processors related to Jieba in the pipeline.
    N)doc)TOKENIZE)ProcessorVariantregister_processor_variantc                  C   s&   zddl } W dS  ty   tdw )zI
    Import necessary components from Jieba to perform tokenization.
    r   NzuJieba is used but not installed on your machine. Go to https://pypi.org/project/jieba/ for installation instructions.T)jiebaImportError)r    r   Y/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/pipeline/external/jieba.pycheck_jieba   s   
r
   r   c                   @   s   e Zd Zdd Zdd ZdS )JiebaTokenizerc                 C   s:   |d dvr
t dt  ddl}|| _|dd| _dS )z Construct a Jieba-based tokenizer by loading the Jieba pipeline.

        Note that this tokenizer uses regex for sentence segmentation.
        lang)zhzzh-hanszzh-hantz[Jieba tokenizer is currently only allowed in Chinese (simplified or traditional) pipelines.r   N	no_ssplitF)	Exceptionr
   r   nlpgetr   )selfconfigr   r   r   r	   __init__   s   zJiebaTokenizer.__init__c           	      C   s   t |tjr
|j}n|}t |tstd| jj|dd}g }g }d}|D ]A}t	d|r4|t
|7 }q%tj|tjtj d| dtj d|t
|  i}|| |t
|7 }| jsf|dv rf|| g }q%t
|dkrr|| t||S )	z^ Tokenize a document with the Jieba tokenizer and wrap the results into a Doc object.
        zFMust supply a string or Stanza Document object to the Jieba tokenizer.F)cut_allr   z\s+=|)u   。u   ！u   ？!?)
isinstancer   Documenttextstrr   r   cutrematchlenTEXTMISC
START_CHAREND_CHARappendr   )	r   documentr   tokens	sentencescurrent_sentenceoffsettokentoken_entryr   r   r	   process&   s2   
(


zJiebaTokenizer.processN)__name__
__module____qualname__r   r.   r   r   r   r	   r      s    r   )__doc__r   stanza.models.commonr   stanza.pipeline._constantsr   stanza.pipeline.processorr   r   r
   r   r   r   r   r	   <module>   s    