o
    hCa                     @   s4  d Z ddlZddlZddlmZ ddlZddlZddlZddlZddl	Z	ddl
Z
ddlT ddlmZ ddlmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZmZmZ ddlmZ ddlmZ ddlm Z  ddl!m"Z" ddl#m$Z$ ddl%m&Z& ddl'm(Z( ddl)m*Z* ddl+m,Z, ddl-m.Z. ddl/m0Z0m1Z1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7m8Z8m9Z9m:Z:m;Z;m<Z<m=Z= ddl>m?Z? ddl@mAZAmBZB ddlCmDZD eEdZFG dd deZGG dd deHZIG dd deHZJG d d! d!eKZLG d"d# d#eMZNd$d% ZOd&d' ZPd(d) ZQd*d+ ZRG d,d- d-ZSd.d/ ZTeUd0kreT  dS dS )1z4
Pipeline that runs tokenize,mwt,pos,lemma,depparse
    N)Enum)*)langcode_to_lang)Document)FoundationCache)default_device)	ProcessorProcessorRequirementsException)NAME_TO_PROCESSOR_CLASSPIPELINE_NAMESPROCESSOR_VARIANTS)LangIDProcessor)TokenizeProcessor)MWTProcessor)POSProcessor)LemmaProcessor)ConstituencyProcessor)CorefProcessor)DepparseProcessor)SentimentProcessor)NERProcessor)DEFAULT_MODEL_DIRDEFAULT_RESOURCES_URLDEFAULT_RESOURCES_VERSIONModelSpecificationadd_dependenciesadd_mwtdownload_modelsdownload_resources_jsonflatten_processor_listload_resources_jsonmaintain_processor_listprocess_pipeline_parametersset_logging_levelsort_processors)PACKAGES)CoNLL
CoNLLError)
make_tablestanzac                   @   s   e Zd ZdZdZdZdZdS )DownloadMethoda  
    Determines a couple options on how to download resources for the pipeline.

    NONE will not download anything, including HF transformers, probably resulting in failure if the resources aren't already in place.
    REUSE_RESOURCES will reuse the existing resources.json and models, but will download any missing models.
    DOWNLOAD_RESOURCES will download a new resources.json and will overwrite any out of date models.
             N)__name__
__module____qualname____doc__NONEREUSE_RESOURCESDOWNLOAD_RESOURCES r5   r5   O/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/pipeline/core.pyr*   '   s
    r*   c                          e Zd Z fddZ  ZS )LanguageNotDownloadedErrorc              	      s6   t  d| d| d| d || _|| _|| _d S )NzCould not find the model file z .  The expected model directory z7 is missing.  Perhaps you need to run stanza.download("z"))super__init__langlang_dir
model_path)selfr;   r<   r=   	__class__r5   r6   r:   4   s    
z#LanguageNotDownloadedError.__init__r.   r/   r0   r:   __classcell__r5   r5   r?   r6   r8   3       r8   c                       r7   )UnsupportedProcessorErrorc              	      s0   t  d| d| d| d || _|| _d S )Nz
Processor z is not known for language z:.  If you have created your own model, please specify the z1_model_path parameter when creating the pipeline.)r9   r:   	processorr;   )r>   rE   r;   r?   r5   r6   r:   ;   s    
z"UnsupportedProcessorError.__init__rA   r5   r5   r?   r6   rD   :   rC   rD   c                       r7   )IllegalPackageErrorc                    s   t  | d S N)r9   r:   )r>   msgr?   r5   r6   r:   A   s   zIllegalPackageError.__init__rA   r5   r5   r?   r6   rF   @   rC   rF   c                   @   s4   e Zd ZdZdd Zedd Zdd Zdd	 Zd
S )PipelineRequirementsExceptionz
    Exception indicating one or more requirements failures while attempting to build a pipeline.
    Contains a ProcessorRequirementsException list.
    c                 C   s   || _ |   d S rG   )_processor_req_failsbuild_message)r>   processor_req_failsr5   r5   r6   r:   J   s   z&PipelineRequirementsException.__init__c                 C      | j S rG   )rJ   r>   r5   r5   r6   rL   N   s   z1PipelineRequirementsException.processor_req_failsc                 C   s6   t  }tdd | jD d|d d|  | _d S )Nc                 S      g | ]}|j qS r5   message).0req_failr5   r5   r6   
<listcomp>T       z?PipelineRequirementsException.build_message.<locals>.<listcomp>
)sepfilez

)ioStringIOprintrL   getvaluerQ   )r>   err_msgr5   r5   r6   rK   R   s   z+PipelineRequirementsException.build_messagec                 C   rM   rG   rP   rN   r5   r5   r6   __str__W   s   z%PipelineRequirementsException.__str__N)	r.   r/   r0   r1   r:   propertyrL   rK   r^   r5   r5   r5   r6   rI   D   s    
rI   c                 C   s   t dd | D r&t| dkrtdtj| d j d| d j dfS t dd | D r@t| dkr9td	t d
dfS dS )z
    Build a config option for a couple situations: lemma=identity, processor is a variant

    Returns the option name and value

    Refactored from build_default_config so that we can reuse it when
    downloading all models
    c                 s   s     | ]}|j t|j v V  qd S rG   )packager   rE   rR   
model_specr5   r5   r6   	<genexpr>d   s    z.build_default_config_option.<locals>.<genexpr>r+   zBVariant processor selected for {}, but multiple packages requestedr   _with_Tc                 s   s$    | ]}|j tko|jd kV  qdS )identityN)rE   LEMMAr`   ra   r5   r5   r6   rc   i   s   " zFIdentity processor selected for lemma, but multiple packages requested_use_identityN)anylenrF   formatrb   rE   r`   rf   model_specsr5   r5   r6   build_default_config_optionZ   s   
rm   c                 C   s   dd | D S )Nc                 S   s$   g | ]\}}t |d u r||fqS rG   )rm   )rR   keyvaluer5   r5   r6   rT   p      $ z#filter_variants.<locals>.<listcomp>r5   rk   r5   r5   r6   filter_variantso   s   rq   c              
      sR  i }|D ]\}t |}|d ur|d ||d < q fdd|D }dd |D }tkrm|| d< g }	|D ]*}
|
sD|	i  q:|	i  |
D ]}|\}}tj ||d |	d | d	< qKq:|	| d
< qt|dkrztd|d | d< |d sq|d D ]}|\}}tj ||d | d| d	< qq|S )Nr+   r   c              	      s$   g | ]}t j |jd  qS ).pt)ospathjoinr`   ra   r;   	model_dirrE   r5   r6   rT   }   rp   z(build_default_config.<locals>.<listcomp>c                 S   rO   r5   )dependenciesra   r5   r5   r6   rT   ~   rU   _model_pathrr   _path_dependencieszLSpecified multiple packages for {}, which currently only handles one package_)	rm   NERappendrs   rt   ru   ri   rF   rj   )	resourcesr;   rw   	load_listdefault_configrl   optionmodel_pathsrx   dependency_pathsdependency_block
dependencydep_processor	dep_modelr5   rv   r6   build_default_configs   sB   


&
r   c              
   C   sR   | du rt jS t| tr'zt |   W S  ty& } ztd|  |d}~ww | S )zM
    Turn None -> DownloadMethod.NONE, strings to the corresponding enum
    NzUnknown download method %s)r*   r2   
isinstancestrupperKeyError
ValueError)download_methoder5   r5   r6   normalize_download_method   s   
r   c                   @   s   e Zd Zdedi ddddejededddddfddZe	dd Z
e	d	d
 Zedd ZdddZdd ZdddZdd ZdddZdS )PipelineendefaultNFc                     s.   ||| _ | _| _|d ur|tkr|| _t|| t|	| _| jtju s8| jtj	u rHt
jt
j| jdsHtd t| j|
||||d |d u rWt| jtju d| _nt|| jtju d| _t | j||\ | _}}td t| j| v rd  v rtd  d  d  d   d  d	  v r  d	 nd
}n|rtd  t }n
td  t } v rt |||d d| _t | j| _| jtjur fdd| jD }t|}t|}t| | j||dd n|rdd t |! D | _ng | _| "|| j| _t#| jdkrC vs1t$  vr<t%d  d  dt%d& t'ddgdd | jD }td  d| d|  t( | j| j| _)| j)*| i | _+ dd }|d u r|d u s|d!krt, }nd"}|d!kr|d"krtd# || _-td$&| j- g }| jD ]}|\}}td%|  | .|| j)}|*| d&| j)v rd&|vr| j)d& |d&< td' t| zt/| || | j-d(| j+|< W q t0y } z|1| |j2| j+|< W Y d }~qd }~w t3y } zhd)|v r~|d) }|j4|ks4t5|t6t fr7|j4|v r7|j4}t
j7|\}}t
j8|}|rVt
j|sVt9 ||||  vrct:| |t
j|s~t
j;|\}}t3d*|  ||f | d }~ww |rtd+ t<|td, d S )-Nzresources.jsonzChecking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES)resources_urlresources_branchresources_versionresources_filepathproxies)local_files_onlyzLoading resource file...alias"z" is an alias for "	lang_name z6Trying to create pipeline for unsupported language: %szeUnsupported language: %s  If trying to add a new language, consider using allow_unknown_language=Truetokenize_pretokenized)maybe_add_mwtc                    s$   g | ]}|d    i v r|qS )r   )getrR   xr;   r   r5   r6   rT      rp   z%Pipeline.__init__.<locals>.<listcomp>F)r   r;   rw   r   r   log_infoc                 S   s    g | ]}|t |d ddgfqS )r   NrE   r`   rx   r   )rR   procr5   r5   r6   rT     s    r   z#No processors to load for language z.  Language z is currently unsupportedzbNo processors to load for language {}.  Please check if your language or package is correctly set.r   Packagec                 S   s,   g | ]}|d  d dd |d D fqS )r   ;c                 s   s    | ]}|j V  qd S rG   )r`   ra   r5   r5   r6   rc     s    z/Pipeline.__init__.<locals>.<listcomp>.<genexpr>r+   )ru   )rR   rowr5   r5   r6   rT        , z#Loading these models for language: z (z):
predict)r;   modeTcpuz$GPU requested, but is not available!zUsing device: {}z	Loading: 	pretaggedzWith settings: )configpipelinedevicer=   zCould not find model file %s, although there are other models downloaded for language %s.  Perhaps you need to download a specific model.  Try: stanza.download(lang="%s",package=None,processors={"%s":"%s"})rV   zDone loading processors!)=r;   dirkwargsr   r#   r   r   r*   r4   r3   rs   rt   existsru   loggerinfor   r   r2   foundation_cacher"   debugr    warningr   r!   r   r   r   rq   r   r   listkeysupdate_kwargsri   r%   r   rj   r(   r   r   update
processorsr   r   filter_configr
   r	   r   err_processorFileNotFoundErrorfilenamer   tuplesplitdirnamer8   rD   splitextrI   ) r>   r;   r   r`   r   logging_levelverboseuse_gpurw   r   r   r   r   r   r   r   r   allow_unknown_languager   r   download_list
load_tablepipeline_level_configspipeline_reqs_exceptionsitemprocessor_namer}   curr_processor_configr   r=   
model_namer<   r5   r   r6   r:      s   

$
	
 












(
zPipeline.__init__c           
         s   dd |D  |   D ]M\}}|dd}t|dkrq|\}}|dkrXt|dk r,|n|d d d |d	d   } |g }t|d
krN|d
 d}	nd }	||	dg |< q fdd D }t|}|S )Nc                 S   s    i | ]\}}|d d |D qS )c                 S   s   g | ]	}|j |jd qS )r`   rx   r   ra   r5   r5   r6   rT   `  s    z5Pipeline.update_kwargs.<locals>.<dictcomp>.<listcomp>r5   )rR   rE   rl   r5   r5   r6   
<dictcomp>`  s    z*Pipeline.update_kwargs.<locals>.<dictcomp>r}   r+   r=      
   z...ir   rx   r   c                    s&   g | ]   fd d  D fqS )c                    s"   g | ]}t  |d  |d dqS )r`   rx   r   r   ra   rE   r5   r6   rT   o     " z5Pipeline.update_kwargs.<locals>.<listcomp>.<listcomp>r5   )rR   processor_dictr   r6   rT   o     & z*Pipeline.update_kwargs.<locals>.<listcomp>)itemsr   ri   r   r$   )
r   processor_listrn   ro   pieceskvr`   original_specrx   r5   r   r6   r   ^  s&   ,zPipeline.update_kwargsc                 C   sL   i }|  D ]}|dd}t|dkrq|\}}|| kr#|| ||< q|S )Nr}   r+   )r   r   ri   )prefixconfig_dictfiltered_dictrn   r   r   r   r5   r5   r6   r   s  s   zPipeline.filter_configc                    s    fddt D S )zy
        Return all currently loaded processors in execution order.
        :return: list of Processor instances
        c                    s"   g | ]} j |r j | qS r5   )r   r   )rR   r   rN   r5   r6   rT     r   z.Pipeline.loaded_processors.<locals>.<listcomp>)r   rN   r5   rN   r6   loaded_processors  s   zPipeline.loaded_processorsc                    s>  t t|tt|tt|tgsJ dt|tr!t|dkr!g S t|to2t|dko2t|d t} du r:t nFt ttttfsLt	d
t t tr\dd  dD  nt  t v rwt| jv rwt vrwtd  t  fd	d
tD   D ]}| j|r|r| j| jn| j| j}||}q|S )a  
        Run the pipeline

        processors: allow for a list of processors used by this pipeline action
          can be list, tuple, set, or comma separated string
          if None, use all the processors this pipeline knows about
          MWT is added if necessary
          otherwise, no care is taken to make sure prerequisites are followed...
            some of the annotators, such as depparse, will check, but others
            will fail in some unusual manner or just have really bad results
        z,input should be either str, list or Documentr   Nz0Cannot process {} as a list of processors to runc                 S   s   h | ]}|qS r5   r5   r   r5   r5   r6   	<setcomp>  s    z#Pipeline.process.<locals>.<setcomp>,z[Requested processors for pipeline did not have mwt, but pipeline needs mwt, so mwt is addedc                    s   g | ]}| v r|qS r5   r5   r   r   r5   r6   rT     s    z$Pipeline.process.<locals>.<listcomp>)rh   r   r   r   r   ri   r   r   setr   rj   typer   TOKENIZEMWTr   r   r   addr   bulk_processprocess)r>   docr   bulkr   r   r5   r   r6   r     s2   $


zPipeline.processc                 O   s&   dd |D }| j |g|R i |S )zk
        Run the pipeline in bulk processing mode

        Expects a list of str or a list of Docs
        c                 S   s&   g | ]}t |tr|ntg |d qS ))text)r   r   )rR   r   r5   r5   r6   rT     r   z)Pipeline.bulk_process.<locals>.<listcomp>r   )r>   docsargsr   r5   r5   r6   r     s   zPipeline.bulk_process2   c           	      /   s    t tjjst fdd}d}| }|rA| j|g|R i |}|D ]}|| |t|j7 }|V  q(| }|sdS dS )z
        Go through an iterator of documents in batches, yield processed documents

        sentence indices will be counted across the entire iterator
        c               	      sF   g } t  D ]}zt}| | W q ty    |  Y   S w | S rG   )rangenextr   StopIteration)batchr}   next_doc
batch_sizer   r5   r6   
next_batch  s   z#Pipeline.stream.<locals>.next_batchr   N)	r   collectionsabcIteratoriterr   reindex_sentencesri   	sentences)	r>   r   r   r   r   r  sentence_start_indexr   r   r5   r   r6   stream  s   

zPipeline.streamc                    s     fddt D }dd| S )z_
        Assemble the processors in order to make a simple description of the pipeline
        c                    s,   g | ]}| j v rd |t j | f qS )z%s=%s)r   r   r   rN   r5   r6   rT     r   z$Pipeline.__str__.<locals>.<listcomp>z<Pipeline: %s>z, )r   ru   )r>   r   r5   rN   r6   r^     s   zPipeline.__str__c                 C   s   |  ||S rG   r   )r>   r   r   r5   r5   r6   __call__  s   zPipeline.__call__rG   )r   )r.   r/   r0   r   r*   r4   r   r   r:   staticmethodr   r   r_   r   r   r   r	  r^   r
  r5   r5   r5   r6   r      s>    
 -



+

r   c                  C   s   t  } | jdtddd | jdtddd | jd	td
dd |  \}}zt|j}ddi}W n+ ty\   t	
d t|jdd}| }W d    n1 sSw   Y  i }Y nw t|jfd|ji|}||}td| d S )Nz--langr   zLanguage of the pipeline to use)r   r   helpz--input_fileTzInput file to read)r   requiredr  z--processorsztokenize,pos,lemma,depparsezProcessors to user   zOInput file %s does not appear to be a conllu file.  Will read it as a text filezutf-8)encodingr   z{:C})argparseArgumentParseradd_argumentr   parse_known_argsr&   	conll2doc
input_filer'   r   r   openreadr   r;   r   r[   rj   )parserr   
extra_argsr   finpiper5   r5   r6   main  s&   

r  __main__)Vr1   r  r  enumr   rY   	itertoolssysloggingjsonrs   stanza.pipeline._constantsstanza.models.common.constantr   stanza.models.common.docr   %stanza.models.common.foundation_cacher   stanza.models.common.utilsr   stanza.pipeline.processorr   r	   stanza.pipeline.registryr
   r   r    stanza.pipeline.langid_processorr   "stanza.pipeline.tokenize_processorr   stanza.pipeline.mwt_processorr   stanza.pipeline.pos_processorr   stanza.pipeline.lemma_processorr   &stanza.pipeline.constituency_processorr   stanza.pipeline.coref_processorr   "stanza.pipeline.depparse_processorr   #stanza.pipeline.sentiment_processorr   stanza.pipeline.ner_processorr   stanza.resources.commonr   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   !stanza.resources.default_packagesr%   stanza.utils.conllr&   r'   stanza.utils.helper_funcr(   	getLoggerr   r*   r   r8   rD   r   rF   	ExceptionrI   rm   rq   r   r   r   r  r.   r5   r5   r5   r6   <module>   s`    @
0  4

