o
    h                     @   s   d dl Z d dlZd dlmZ d dlmZ d dlmZ edZdZdd Z	d	d
 Z
e	dfddZe	dfddZedkr@e  dS dS )    N)tqdm)
parse_tree)tree_readerz[{]turkish=([^}]+)[}])DTDETsvpAFVPCONJINTJz-XXX-c           	      C   s   t | }t|dkrtd|d }| }g }|D ]&}t|}|du r-td||d}|	dd	dd	}|
| q||}||g}td
d |D r]td||S )z
    Reads in a tree, then extracts specifically the word from the specific format used

    Also converts LCB/RCB as needed
       zTree file had two trees!r   NzCould not find word in |{}|z-LCB-{z-RCB-}c                 s   s    | ]}|t v V  qd S )N)DISALLOWED_LABELS).0label r   n/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/utils/datasets/constituency/convert_starlang.py	<genexpr>%   s    zread_tree.<locals>.<genexpr>z#found an unexpected phrasal node {})r   
read_treeslen
ValueErrorleaf_labels
TURKISH_REsearchformatgroupreplaceappendreplace_wordsget_unique_constituent_labelsany)	texttreestreelabels
new_labelsr   matchword
con_labelsr   r   r   	read_tree   s$   



r*   c           	      C   s   g }| D ]H}t |dd}| }W d    n1 sw   Y  z||}|d ur.|| W q tyL } z|rBtd||| W Y d }~qd }~ww |S )Nzutf-8)encodingz<-----------------
Found an error in {}: {} Original text: {})openreadr   r   printr   )		filenames
conversionlogr#   filenamefinr"   r$   er   r   r   
read_files)   s    

r5   Tc           
         s   t | tr| f} g }g }g }| D ],  fddt D }|dd |D  |dd |D  |dd |D  qtdt|t| t|   tt|||d}tt|||d}tt|||d}	|||	fS )z
    Read the starlang trees, converting them using the given method.

    read_tree or any other conversion turns one file at a time to a sentence.
    log is whether or not to log a ValueError - the NER division has many missing labels
    c                    s   g | ]	}t j |qS r   )ospathjoinr   xr7   r   r   
<listcomp>F       z!read_starlang.<locals>.<listcomp>c                 S      g | ]	}| d r|qS )z.trainendswithr9   r   r   r   r<   G   r=   c                 S   r>   )z.devr?   r9   r   r   r   r<   H   r=   c                 S   r>   )z.testr?   r9   r   r   r   r<   I   r=   zReading %d total filesr0   r1   )	
isinstancestrr6   listdirextendr.   r   r5   r   )
pathsr0   r1   train_files	dev_files
test_files
tree_filestrain_treebankdev_treebanktest_treebankr   r;   r   read_starlang7   s   
 
rN   c                 C   sb   g d}t || |d\}}}tdt|  tdt|  tdt|  t|d  |||fS )N)z<extern_data/constituency/turkish/TurkishAnnotatedTreeBank-15z=extern_data/constituency/turkish/TurkishAnnotatedTreeBank2-15z=extern_data/constituency/turkish/TurkishAnnotatedTreeBank2-20rA   z	Train: %dzDev: %dzTest: %dr   )rN   r.   r   )r0   r1   rF   rK   rL   rM   r   r   r   mainR   s   
rO   __main__)r6   rer   stanza.models.constituencyr   r   compiler   r   r*   r5   rN   rO   __name__r   r   r   r   <module>   s   

