o
    h                     @   s   d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlmZm	Z	m
Z
 	 defddZG dd dZdd	d
ZedkrAe  dS dS )    N)utils)ListTupleAnypathc                 C   s   t jjj| S )zh"
    loads in a Stanza document object from a path to a CoNLL file containing annotated sentences.
    )stanzar   conllCoNLL	conll2doc)r    r   i/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/models/lemma_classifier/prepare_dataset.pyload_doc_from_conll_file   s   r   c                   @   s^   e Zd Zdedee defddZdd Zdee fd	d
Ze	dd Z
deddfddZdS )DataProcessortarget_wordtarget_uposallowed_lemmasc                 C   s(   || _ t|| _|| _t|| _d S N)r   recompiletarget_word_regexr   r   )selfr   r   r   r   r   r   __init__   s   zDataProcessor.__init__c                 C   s0   |j D ]}| j|jr|j| jv r dS qdS )NTF)wordsr   	fullmatchtextuposr   )r   sentencewordr   r   r   keep_sentence    s
   
zDataProcessor.keep_sentencereturnc                 C   s@   g }t |jD ]\}}| j|jr|j| jv r|| q|S )zt
        Finds all occurrences of self.target_word in tokens and returns the index(es) of such occurrences.
        )	enumerater   r   r   r   r   r   append)r   r   occurrencesidxtokenr   r   r   find_all_occurrences&   s   
z"DataProcessor.find_all_occurrencesc                 C   s   t | ddd@}|d |dt|  |d d}|D ]}|s,|d d	}n|d
 |t| q |d W d    d S 1 sJw   Y  d S )Nzw+zutf-8)encodingz{
z  "upos": %s,
z  "sentences": [Fz
    Tz,
    z
  ]
}
)openwritejsondumps)	save_namer   	sentencesoutput_fwrote_sentencer   r   r   r   write_output_file0   s   



"zDataProcessor.write_output_filer+   Nc           	   	      s   g }|j D ]C |  rHdd  jD }|  }|D ],}| j j| jrG fddtt jD }t|}|	||| j| jd qq|rS| 
|| j| |S )a  
        Takes any sentence from `doc` that meets the condition of `keep_sentence` and writes its tokens, index of target word, and lemma to `save_name`

        Sentences that meet `keep_sentence` and contain `self.target_word` multiple times have each instance in a different example in the output file.

        Args:
            doc (Stanza.doc): Document object that represents the file to be analyzed
            save_name (str): Path to the file for storing output
        c                 S   s   g | ]}|j qS r   )r   ).0r$   r   r   r   
<listcomp>O   s    z2DataProcessor.process_document.<locals>.<listcomp>c                    s   g | ]} j | jqS r   )r   r   )r0   ir   r   r   r1   V   s    )r   	upos_tagsindexlemma)r,   r   r   r%   r   r   r6   rangelenr!   r/   r   )	r   docr+   r,   tokensindexesr#   r4   
num_tokensr   r3   r   process_document@   s&   




zDataProcessor.process_document)__name__
__module____qualname__strr   r   r   intr%   staticmethodr/   r=   r   r   r   r   r      s    

r   c           	      C   s   t  }|jdttjtjtddd |jdtddd |jdtd	d
d |jdtddd |jdtddd |	| } | j
}| j}| j}| j}t| } | D ]}t| d| |   qQt|}t| d |g|d}|||S )Nz--conll_pathzen_gum-ud-train.conlluz#path to the conll file to translate)typedefaulthelpz--target_wordz'szToken to classify on, e.g. 's.z--target_uposAUXzupos on target tokenz--output_pathztest_output.txtzPath for output filez--allowed_lemmasz.*z?A regex for allowed lemmas.  If not set, all lemmas are allowedz: r   )r   r   r   )argparseArgumentParseradd_argumentrA   osr   joindirname__file__
parse_args
conll_pathr   output_pathr   varsprintr   r   r=   )	argsparserrP   r   rQ   r   argr9   	processorr   r   r   mainc   s"   $
rX   __main__r   )rH   r)   rK   r   r   stanza.models.lemma_classifierr   typingr   r   r   rA   r   r   rX   r>   r   r   r   r   <module>   s    
K
