o
    h*                     @   s  d Z ddlZddlZddlmZ ddlmZmZmZ i ddddd	d
ddddddddddddddddddddddddd d!d"d#i d$d#d%d&d'd#d(d)d*d+d,d-d.d/d0d1d2d1d3d1d4d5d6d7d8d7d9d:d;d<d=d>d?d-dd@dAdBddCdDZ	dEdF Z
dGdH ZdIdJ Zeeg dKee	  Zeeg dLee	  ZdXdOdPZdYdQdRZdSdT ZdUdV ZedWkre  dS dS )ZaF  
Script for processing the VTB files and turning their trees into the desired tree syntax

The VTB original trees are stored in the directory:
VietTreebank_VLSP_SP73/Kho ngu lieu 10000 cay cu phap
The script requires two arguments:
1. Original directory storing the original trees
2. New directory storing the converted trees
    N)defaultdict)
read_treesMixedTreeErrorUnlabeledTreeErrorz(ADV-MDPz(RP-MDPz(MPDz(MDPz(MP z(NP z(MP(z(NP(z(Np(z(Np (z(NP (z(NLOCz(NP-LOCz(N-P-LOCz(N-p-locz(NPDOBz(NP-DOBz(NPSUBz(NP-SUBz(NPTMPz(NP-TMPz(PPLOCz(PP-LOCz(SBA z(SBAR z(SBA-z(SBAR-z(SBA(z(SBAR(z(SBASz(SBARz(SABRz(SE-SPLz(S-SPLz(SBARRPPADVzPP-ADVz(PR (z(PP (z(PPPz(PPVP0ADVzVP-ADVz(S1z(Sz(S2z(S3zBP-SUBzNP-SUBAPPPDzAP-PPDAPPRDzNp--HzNp-Hz(WPNPz(WHNPz(WHRPPz(WHRPz(PVz(WHRP u   (WHPP (Pro-h nào))z(WHRP (Pro-h Sao))z(NP)z(Mpdz(Whadv u   (Whpr (Pro-h nào))z(Whpr (Pro-h Sao))z(Tp-tmpz(Ypc                 C   s"   t  D ]
\}}| ||} q| S )N)	REMAPPINGitemsreplace)treeoldnew r   i/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/utils/datasets/constituency/vtb_convert.pyunify_labelG   s   r   c                 C   s4   d}| D ]}|dkr|d7 }q|dkr|d8 }q|S )z}
    Checks if the tree is properly closed
    :param tree: tree as a string
    :return: True if closed otherwise False
    r   (   )r   )r   countcharr   r   r   count_paren_parityN   s   
r   c                 C   s   |  ds
| drdS dS )a  
    Check if a line being read is a valid constituent

    The idea is that some "trees" are just a long list of words with
    no tree structure and need to be eliminated.

    :param line: constituent being read
    :return: True if it has open OR closing parenthesis.
    r   r   TF)
startswithendswith)liner   r   r   is_valid_line]   s   
r   )WPYPSNPSTCUPCz(TPXpXPWHVPWHPRNOWHADV(SC ((VOC ((Adv ((SP (ADV-MDP(SPL(ADV ((V-MWE ()r   r   r   r    r!   r"   r#   r$   r%   r(   r)   r*   r+   r,   r-   r.   r/   TFc                 C   s  |rt }nt}tt}t| ddd}t|ddd}	| }
d}d}d}t|
D ]w\}}d| }|dkr<q+|d	ksE|	d
rid}|d7 }d}|	d
rh|dd }|
ds`J t|dd }q+|dkr|r| dkr|d d| | q+|d7 }t|}|dkr|d d| || q+|dk r|d d| || q+|r|dddd}zt|}t|dkr|d d | | W q+t|dkr|d d| | W q+|d  s|d! d"| || W q+|rt|}d}|D ]}||dkrd}|| d#|| ||  nq|r$W q+|rD|du r=|d$ d%| | |	d	 n|	d&|  |	| |rQ|	d' d}d}d}W q+ tyn   |d( d)| || Y q+ ty   |d* d+| || Y q+w t|r|r||7 }q+|r|d, d-| ||| d}q+W d   n1 sw   Y  W d   |S W d   |S 1 sw   Y  |S ).z
    :param orig_file: original directory storing original trees
    :param new_file: new directory storing formatted constituency trees
    This function writes new trees to the corresponding files in new_file
    rzutf-8)encodingw NF z<s>z<s id=z(ROOT T=r   >z</s>z(ROOTemptyzEmpty tree in {} line {}z)
r   unclosedz#Unclosed tree from {} line {}: |{}|extra_parenszGExtra parens at end of tree from {} line {} for having extra parens: {}RBKTz-RRB-LBKTz-LRB-multiplez4Multiple trees in one xml annotation from {} line {}untagged_leafz2Tree with non-preterminal leaves in {} line {}: {}z"Weird label {} from {} line {}: {}
missing_idzMissing ID from {} at line {}z
<s id=%d>
z</s>
mixedz1Mixed leaves and constituents from {} line {}: {}	unlabeledz+Unlabeled nodes in tree from {} line {}: {}invalidzEInvalid tree error in {} line {}: |{}|, rejected because of line |{}|)WEIRD_LABELS_2023WEIRD_LABELSr   listopen	readlines	enumeratejoinsplitr   r   intstripappendformatr   r   r   lenall_leaves_are_preterminalsr   findwriter   r   r   )	orig_filenew_file
fix_errorsconvert_bracketsupdated_tagset	write_idsweird_labelserrorsreaderwritercontentr   tree_idreading_treeline_idxr   parityprocessed_trees	bad_labelweird_labelr   r   r   convert_filer   s   $




*VVVre   c                 C   s  t t}| D ]3}tjtj|d \}	}
tj||	}| d}t||||||}|D ]}|| ||  q-qt	|
 dkrHtd d S td t|
 }|rn|D ]}td|  td||  t  qVt  |D ]}td|t	|| f  qpd S )	Nr7   z.mrgr   zAll errors were fixed!zFound the following errors:z--------- %10s -------------z

z%s: %d)r   rE   ospathsplitextrJ   rI   re   extendrO   keysprintsorted)	file_listnew_dirverboserU   rV   rW   rX   rZ   filename	base_name_new_pathnew_file_path
new_errorserj   r   r   r   convert_files   s,   
rw   c                    s*   t  } fdd|D }t|| d S )Nc                    s.   g | ]}t j|d  dkrt j |qS )r   z.raw)rf   rg   rh   rI   ).0forig_dirr   r   
<listcomp>   s   . zconvert_dir.<locals>.<listcomp>)rf   listdirrw   )r{   rn   rm   r   rz   r   convert_dir   s   
r~   c                  C   sJ   t jdd} | jddd | jddd |  }|j}|j}t|| dS )	z
    Converts files from the 2009 version of VLSP to .mrg files
    
    Process args, loop through each file in the directory and convert
    to the desired tree format
    z7Script that converts a VTB Tree into the desired format)descriptionr{   z>The location of the original directory storing original trees )helprn   z=The location of new directory storing the new formatted treesN)argparseArgumentParseradd_argument
parse_argsorg_dirrn   r~   )parserargsr   rn   r   r   r   main   s   r   __main__)TFFF)FTFFF)__doc__r   rf   collectionsr   &stanza.models.constituency.tree_readerr   r   r   r
   r   r   r   rl   setrE   rj   rD   rC   re   rw   r~   r   __name__r   r   r   r   <module>   s    
	
 !$%5

c
