o
    h                     @   s`   d Z ddlZddlZddlmZ ddlmZ ddlT ddlm	Z	m
Z
 e
eG dd de	ZdS )	z5
Processor for performing multi-word-token expansion
    N)
DataLoader)Trainer)*)UDProcessorregister_processorc                       sH   e Zd ZeegZeegZdd Zdd Z	dd Z
 fddZ  ZS )	MWTProcessorc                 C   s   t |d |d| _d S )N
model_path)
model_filedevice)r   _trainer)selfconfigpipeliner
    r   X/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/pipeline/mwt_processor.py_set_up_model   s   zMWTProcessor._set_up_modelc                 C   s   t || jd | j| jdddS )N
batch_sizeT)vocab
evaluationexpand_unk_vocab)r   r   r   )r   documentr   r   r   build_batch   s   zMWTProcessor.build_batchc                 C   s   |  |}|jjdd}t|dkrZ| jd r| j|}n>t ! g }t	|
 D ]\}}|| jj|d|jd7 }q+W d    n1 sFw   Y  | jddrY| j||}ng }|jj|dd |jS )	NT)r   r   	dict_only)never_decode_unkr   ensemble_dictF)process_manual_expanded)r   docget_mwt_expansionslenr   trainerpredict_dicttorchno_grad	enumerate	to_loaderpredictr   getensembleset_mwt_expansions)r   r   batch
expansionspredsibr   r   r   process   s"   


zMWTProcessor.processc                    s"   t  |}|D ]}|  q|S )zx
        MWT processor counts some statistics on the individual docs, so we need to separately redo those stats
        )superbulk_process_count_words)r   docsr   	__class__r   r   r0   4   s   
zMWTProcessor.bulk_process)__name__
__module____qualname__setMWTPROVIDES_DEFAULTTOKENIZEREQUIRES_DEFAULTr   r   r.   r0   __classcell__r   r   r3   r   r      s    

r   )__doc__ior!   stanza.models.mwt.datar   stanza.models.mwt.trainerr   stanza.pipeline._constantsstanza.pipeline.processorr   r   r9   r   r   r   r   r   <module>   s    