o
    h                     @   s   d dl Z d dlZd dlmZmZ d dlmZ d dlmZ d dl	m
Z
 d dlmZ dZefdd	Zd
d Zdd Zdd Zdd Zdd Zdd ZeeeeeedZdd Zedkrbeejd  dS dS )    N)find_treebank_dataset_fileUnknownDatasetError)get_default_paths)prepare_dataset)short_name_to_treebank)CoNLL)traindevtestc              	   C   s   t |}| d }| d }tj|dd g }	|D ]5}
t|||
ddd}tj|d||
f }d|d	|d
|d|g}|d urB|d|g t| |		| q|	S )NUDBASELEMMA_CLASSIFIER_DATA_DIRTexist_okconllufail%s.%s.lemmaz--conll_pathz--target_wordz--target_uposz--output_pathz--allowed_lemmas)
r   osmakedirsr   pathjoinextendr   mainappend)paths
short_nameworduposallowed_lemmassectionstreebank
udbase_dir
output_diroutput_filenamessectionfilenameoutput_filenameargs r(   i/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/utils/datasets/prepare_lemma_classifier.pyprocess_treebank   s$   
r*   c              	   C   sv  | d }| d }t j|dd g d}ddg}d}d	g}g g g g}|D ]9}	ttD ]2\}
}t|	||d
dd}t|}tj||dd}|j	|d d}t
dt||f  ||
 | q)q#|D ]2}	d}t|	||d
dd}t|}tj||dd}|j	|d d}t
dt||f  |d | q_tt|D ]!\}}t j|d||f }tj||| t
dt||f  qd S )Nr   r   Tr   )zUD_English-EWTzUD_English-GUMzUD_English-GUMRedditzUD_English-LinESzUD_English-PUDzUD_English-Pronounsz'sAUXr   r   z.*)target_wordtarget_uposr   )	save_namezRead %d sentences from %sr
      r   zWrote %s sentences to %s)r   r   	enumerateSECTIONSr   r   	conll2docr   DataProcessorprocess_documentprintlenr   zipr   r   write_output_file)r   r   r!   r"   train_treebankstest_treebanksr,   r-   	sentencesr    section_idxr$   r%   doc	processornew_sentencessection_sentencesr&   r(   r(   r)   process_en_combined#   s<   


rA   c                 C   s    d}d}d }t | |||| d S )Nu   だr+   r*   r   r   r   r   r   r(   r(   r)   process_ja_gsdE   s   	rD   c                 C       d}d}d}t | |||| d S )Nu   شدVERBu   کرد|شدrB   rC   r(   r(   r)   process_fa_perdtT      rG   c                 C   rE   )Nu   केADPu   का|केrB   rC   r(   r(   r)   process_hi_hdtb[   rH   rJ   c                 C   rE   )Nu   أنSCONJu   أَن|أَنَّrB   rC   r(   r(   r)   process_ar_padtb   rH   rL   c                 C   s    d}d}d}t | |||| dS )u  
    All of the Greek lemmas for these words are εγώ or μου

    τους PRON Counter({'μου': 118, 'εγώ': 32})
    μας PRON Counter({'μου': 89, 'εγώ': 32})
    του PRON Counter({'μου': 82, 'εγώ': 8})
    της PRON Counter({'μου': 80, 'εγώ': 2})
    σας PRON Counter({'μου': 34, 'εγώ': 24})
    μου PRON Counter({'μου': 45, 'εγώ': 10})
    u+   τους|μας|του|της|σας|μουPRONNrB   rC   r(   r(   r)   process_el_gdti   s   rN   )ar_padtel_gdten_combinedfa_perdthi_hdtbja_gsdc                 C   sL   t  }td|   | tv rt|  ||  n	t| d|  dtd|   d S )NzProcessing %szdataset z5 currently not handled by prepare_lemma_classifier.pyzDone processing %s)r   r5   DATASET_MAPPINGr   )dataset_namer   r(   r(   r)   r      s   r   __main__   )r   sysstanza.utils.datasets.commonr   r   stanza.utils.default_pathsr   stanza.models.lemma_classifierr   +stanza.models.common.short_name_to_treebankr   stanza.utils.conllr   r1   r*   rA   rD   rG   rJ   rL   rN   rU   r   __name__argvr(   r(   r(   r)   <module>   s4    "
