o
    hU                      @   s   d Z ddlmZ ddlZddlZddlmZ ddlmZ ddl	m
Z
 ddlmZmZ ddlT dd	lmZmZmZ ed
ZG dd dZdS )z*
Class for running multilingual pipelines
    )OrderedDictN)UnionDocument)default_device)PipelineDownloadMethod)*)DEFAULT_MODEL_DIRget_language_resourcesload_resources_jsonstanzac                   @   sz   e Zd ZdZedddddddejdf
dededed	e	d
e	de
de
dededeeef fddZdd Zdd Zdd ZdS )MultilingualPipelinea  
    Pipeline for handling multilingual data. Takes in text, detects language, and routes request to pipeline for that
    language.

    You can specify options to individual language pipelines with the lang_configs field.
    For example, if you want English pipelines to have NER, but want to turn that off for French, you can do:
        lang_configs = {"en": {"processors": "tokenize,pos,lemma,depparse,ner"},
                        "fr": {"processors": "tokenize,pos,lemma,depparse"}}
        pipeline = MultilingualPipeline(lang_configs=lang_configs)

    You can also pass in a defaultdict created in such a way that it provides default parameters for each language.
    For example, in order to only get tokenization for each language:
    (remembering that the Pipeline will automagically add MWT to a language which uses MWT):
        from collections import defaultdict
        lang_configs = defaultdict(lambda: dict(processors="tokenize"))
        pipeline = MultilingualPipeline(lang_configs=lang_configs)

    download_method can be set as in Pipeline to turn off downloading
      of the .json config or turn off downloading of everything
    N@   
   F	model_dirlang_id_configlang_configsld_batch_sizemax_cache_sizeuse_gpurestrictdevicedownload_method
processorsc                 C   sd  || _ |d u r	i nt|| _|d u ri nt|| _|| _t | _|
d u r*d | _nt	|
t
r;dd |
dD | _nt|
| _|	| _d| jvrN| j| jd< | jD ]}d| j| vra|| j| d< qQ|rd| jvrt| j }|dkrztd ntd	| || jd< |d u r|d u s|d
krt }nd}|| _td| j dd| jd| j| _t| j | _d S )Nc                 S   s   g | ]}|  qS  )strip.0xr   r   W/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/pipeline/multilingual.py
<listcomp>A   s    z1MultilingualPipeline.__init__.<locals>.<listcomp>,r   langlangid_lang_subsetr   z`MultilingualPipeline asked to restrict to lang_configs, but lang_configs was empty.  Ignoring...z&Restricting MultilingualPipeline to %sTcpumultilinguallangid)dirr#   r   r   r   )r   copydeepcopyr   r   r   r   pipeline_cachedefault_processors
isinstancestrsplitlistr   sortedkeysloggerwarningdebugr   r   r   lang_id_pipeliner   	resources)selfr   r   r   r   r   r   r   r   r   r   r#   known_langsr   r   r    __init__(   sD   




zMultilingualPipeline.__init__c                    s6  || j v r| j j|dd z| j| }W n ty&   d|i}|| j|< Y nw d|vr/||d< d|vr8| j|d< d|vrd| jrdt| j|  fdd| jD }|| jkr]t	d| j|| d	
||d< d
|vrm| j|d
< || j vrtd| t| j | jkr| j jdd tdd| ji| j| | j |< dS dS )z
        Do any necessary updates to the pipeline cache for this language. This includes building a new
        pipeline for the lang, and possibly clearing out a language with the old last access date.
        T)lastr#   r   r   c                    s   g | ]}| v r|qS r   r   r   lang_resourcesr   r    r!          z?MultilingualPipeline._update_pipeline_cache.<locals>.<listcomp>zENot all requested processors %s available for %s.  Loading %s insteadr"   r   z4Loading unknown language in MultilingualPipeline: %sFr(   Nr   )r+   move_to_endr   KeyErrorr   r,   r   r7   r3   infojoinr   r5   lenr   popitemr   r   )r8   r#   lang_configlang_processorsr   r<   r    _update_pipeline_cacheg   s6   




$z+MultilingualPipeline._update_pipeline_cachec                 C   s   t |t }|r|g}n|}|rt |d trdd |D }| j|}i }t|D ]\}}td||j |j|vr@g ||j< ||j 	| q*|
 D ]}| | | j| ||  qM|rd|d S |S )zz
        Run language detection on a string, a Document, or a list of either, route to language specific pipeline
        r   c                 S   s   g | ]}t g |d qS ))textr   )r   rH   r   r   r    r!      r>   z0MultilingualPipeline.process.<locals>.<listcomp>zLanguage for document %d: %s)r-   r0   r.   r6   process	enumerater3   r5   r#   appendr2   rG   r+   )r8   docsingleton_inputdocsdocs_w_langidlang_batchesdoc_idxr#   r   r   r    rI      s&   


zMultilingualPipeline.processc                 C   s   |  |}|S )N)rI   )r8   rL   r   r   r    __call__   s   
zMultilingualPipeline.__call__)__name__
__module____qualname____doc__r
   r   DOWNLOAD_RESOURCESr.   dictintboolr   r0   r:   rG   rI   rR   r   r   r   r    r      sH    	

?-%r   )rV   collectionsr   r)   loggingtypingr   stanza.models.common.docr   stanza.models.common.utilsr   stanza.pipeline.corer   r   stanza.pipeline._constantsstanza.resources.commonr
   r   r   	getLoggerr3   r   r   r   r   r    <module>   s    
