o
    h                     @   sr   d Z ddlZddlZddlZddlZddlmZ ddlmZ ddl	T ddl
mZmZ eedG dd	 d	eZdS )
z-
Processor for determining language of text.
    NDocument)LangIDBiLSTM)*)UDProcessorregister_processor)namec                   @   s   e Zd ZdZeegZeg ZdZdd Z	dd Z
dd Zed	Zed
ZedZedZeeeegZedd Zdd Zdd Zdd ZdS )LangIDProcessorz/
    Class for detecting language of text.
    i  c                 C   sD   | dd}tj|d ||| dd| _| jj| _| d| _d S )N
batch_size@   
model_pathlang_subset)pathdevicer
   r   
clean_text)getr   load_modelchar_to_idx_char_index_clean_text)selfconfigpipeliner   r
    r   [/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/pipeline/langid_processor.py_set_up_model   s   

zLangIDProcessor._set_up_modelc                    sP   t  j j}g }|D ]} fddt|D }|| qtj||tjdS )zX
        Map list of strings to batch tensor. Assumed all docs are same length.
        c                    s    g | ]} j | j d  qS )UNK)r   r   ).0cr   r   r   
<listcomp>.   s     z3LangIDProcessor._text_to_tensor.<locals>.<listcomp>)r   dtype)	nextr   
parametersr   listappendtorchtensorlong)r   docsr   all_docsdoc	doc_charsr   r    r   _text_to_tensor&   s   zLangIDProcessor._text_to_tensorc                    s"    j |} fdd|D }|S )zH
        Identify languages for each sequence in a batch tensor
        c                    s   g | ]} j j| qS r   )r   
idx_to_tag)r   
predictionr    r   r   r!   7   s    z-LangIDProcessor._id_langs.<locals>.<listcomp>)r   prediction_scores)r   batch_tensorpredictionsprediction_labelsr   r    r   	_id_langs2   s   zLangIDProcessor._id_langszhttps?:\/\/t\.co/[a-zA-Z0-9]+z@[a-zA-Z0-9_]+z
#[a-zA-Z]+z[!.]+c                 C   sD   t jD ]}|d| } qt| } tj| dd} |  r |  } | S )z
        Process text to improve language id performance. Main emphasis is on tweets, this method removes shortened
        urls, hashtags, handles, and punctuation and emoji.
         )replace)r	   all_regexessubemojiemojizereplace_emojistrip)textregexr   r   r   r   B   s   

zLangIDProcessor.clean_textc           	      C   s   t |dkrdS t|d trdd |D }i }|D ]#}| jr%t|jn|j}t |}||vr4g ||< || ||f q|D ]$}dd || D }| | 	|}t
|| |D ]	\}}||d _qZq@|S )zC
        Identify language of list of strings or Documents
        r   Nc                 S   s   g | ]}t g |qS r   r   )r   r>   r   r   r   r!   _   s    z1LangIDProcessor._process_list.<locals>.<listcomp>c                 S   s   g | ]}|d  qS )   r   )r   r,   r   r   r   r!   j   s    )len
isinstancestrr   r	   r   r>   r&   r5   r.   ziplang)	r   r*   docs_by_lengthr,   r>   
doc_lengthinputsr3   rE   r   r   r   _process_listT   s$   zLangIDProcessor._process_listc                 C   s   |g}|  |d S )z/
        Handle single str or Document
        r   rI   )r   r,   wrapped_docr   r   r   processq   s   zLangIDProcessor.processc                 C   s
   |  |S )z5
        Handle list of strings or Documents
        rJ   )r   r*   r   r   r   bulk_processy   s   
zLangIDProcessor.bulk_processN)__name__
__module____qualname____doc__setLANGIDPROVIDES_DEFAULTREQUIRES_DEFAULTMAX_SEQ_LENGTH_DEFAULTr   r.   r5   recompile
http_regexhandle_regexhashtag_regexpunctuation_regexr8   staticmethodr   rI   rL   rM   r   r   r   r   r	      s$    






r	   )rQ   r:   rW   stanzar'   stanza.models.common.docr   stanza.models.langid.modelr   stanza.pipeline._constantsstanza.pipeline.processorr   r   rS   r	   r   r   r   r   <module>   s    