"""
Prepares train, dev, test for a treebank

For example, do
  python -m stanza.utils.datasets.prepare_tokenizer_treebank TREEBANK
such as
  python -m stanza.utils.datasets.prepare_tokenizer_treebank UD_English-EWT

and it will prepare each of train, dev, test

There are macros for preparing all of the UD treebanks at once:
  python -m stanza.utils.datasets.prepare_tokenizer_treebank ud_all
  python -m stanza.utils.datasets.prepare_tokenizer_treebank all_ud
Both are present because I kept forgetting which was the correct one

There are a few special case handlings of treebanks in this file:
  - all Vietnamese treebanks have special post-processing to handle
    some of the difficult spacing issues in Vietnamese text
  - treebanks with train and test but no dev split have the
    train data randomly split into two pieces
  - however, instead of splitting very tiny treebanks, we skip those
"""

import argparse
import glob
import io
import os
import random
import re
import tempfile
import zipfile

from collections import Counter

from stanza.models.common.constant import treebank_to_short_name
import stanza.utils.datasets.common as common
from stanza.utils.datasets.common import read_sentences_from_conllu, write_sentences_to_conllu, write_sentences_to_file, INT_RE, MWT_RE, MWT_OR_COPY_RE
import stanza.utils.datasets.tokenization.convert_ml_cochin as convert_ml_cochin
import stanza.utils.datasets.tokenization.convert_my_alt as convert_my_alt
import stanza.utils.datasets.tokenization.convert_vi_vlsp as convert_vi_vlsp
import stanza.utils.datasets.tokenization.convert_th_best as convert_th_best
import stanza.utils.datasets.tokenization.convert_th_lst20 as convert_th_lst20
import stanza.utils.datasets.tokenization.convert_th_orchid as convert_th_orchid

def copy_conllu_file(tokenizer_dir, tokenizer_file, dest_dir, dest_file, short_name):
    original = f"{tokenizer_dir}/{short_name}.{tokenizer_file}.conllu"
    copied = f"{dest_dir}/{short_name}.{dest_file}.conllu"
    print("Copying from %s to %s" % (original, copied))
    sents = read_sentences_from_conllu(original)
    write_sentences_to_conllu(copied, sents)

def copy_conllu_treebank(treebank, model_type, paths, dest_dir, postprocess=None, augment=True):
    """
    This utility method copies only the conllu files to the given destination directory.

    The POS, lemma, and depparse annotators all need this.
    """
    ...


def split_train_file(treebank, train_input_conllu, train_output_conllu, dev_output_conllu):
    ...


def has_space_after_no(piece):
    if not piece or piece == "_":
        return False
    if piece == "SpaceAfter=No":
        return True
    tags = piece.split("|")
    return any(t == "SpaceAfter=No" for t in tags)

def remove_space_after_no(piece, fail_if_missing=True):
    """
    Removes a SpaceAfter=No annotation from a single piece of a single word.
    In other words, given a list of conll lines, first call split("\t"), then call this on the -1 column.
    """
    ...


def add_space_after_no(piece, fail_if_found=True):
    ...

def augment_arabic_padt(sents, ratio=0.05):
    """
    Basic Arabic tokenizer gets the trailing punctuation wrong if there is a blank space.

    Reason seems to be that there are almost no examples of "text ." in the dataset.
    This function augments the Arabic-PADT dataset with a few such examples.
    TODO: it may very well be that a lot of tokenizers have this problem.

    Also, there are a few examples in UD2.7 which are apparently
    headlines where there is a ' . ' in the middle of the text.
    According to an Arabic speaking labmate, the sentences are
    headlines which could be reasonably split into two items.  Having
    them as one item is quite confusing and possibly incorrect, but
    such is life.
    """
    ...

def augment_telugu(sents):
    """
    Add a few sentences with modified punctuation to Telugu_MTG

    The Telugu-MTG dataset has punctuation separated from the text in
    almost all cases, which keeps the tokenizer from learning how to
    process attached punctuation correctly.

    All of the Telugu sentences end with their sentence final
    punctuation being separated.  Furthermore, all commas are
    separated.  We change that on some subset of the sentences to
    make the tools more generalizable on wild text.
    """
    ...


COMMA_SEPARATED_RE = re.compile(" ([a-zA-Z]+)[,] ([a-zA-Z]+) ")


    This leaves the tokens and all of the other data the same.  The
    only change made is to change SpaceAfter=No for the "," token and
    adjust the #text line, with the assumption that the conllu->txt
    conversion will correctly handle this change.

    This was particularly an issue for Spanish-AnCora, but it's
    reasonable to think it could happen to any dataset.  Currently
    this just operates on commas and ascii letters to avoid
    accidentally squishing anything that shouldn't be squished.

    UD_Spanish-AnCora 2.7 had a problem is with this sentence:
    # orig_file_sentence 143#5
    In this sentence, there was a comma smashed next to a token.

    Fixing just this one sentence is not sufficient to tokenize
    "asdf,zzzz" as desired, so we also augment by some fraction where
    we have squished "asdf, zzzz" into "asdf,zzzz".

    This exact example was later fixed in UD 2.8, but it should still
    potentially be useful for compensating for typos.
    rY   ú#ra   r4   ro   rZ   r[   Nú, z3Added %d new sentences with asdf, zzzz -> asdf,zzzz)rq   rN   ÚCOMMA_SEPARATED_REÚsearchr5   r   Úgroupr9   rU   rc   rP   rd   r   )r   re   rf   rg   Útext_idxrh   Úmatchrt   ru   Úcommarj   ri   Útext_offsetÚtext_lenÚnew_textr   r   r   Úaugment_comma_separationsò   sF   
ÿ
 

&"<
€rƒ   ç{®Gáz”?c                 C   s.  g }d}| D ]}t   ¡ |kr| |¡ qd}t|ƒD ]X\}}| d¡r%q|dks1|t|ƒd kr2q| d¡}|d dkrst|d ƒss||d  }	t|	 d¡d ƒsSq||d  }
t |
 d¡d ¡rdqt |	 d¡d ¡roqd	} nq|s|| |¡ qt	|ƒ}||  d¡}t
|d ƒ|d< d |¡||< ||d   d¡}|d }	t|d ƒ|d< d |¡||d < ||d   d¡d }
t|ƒD ]A\}}| d
¡r|	d |
 }|	d |
 }| |¡}|dk rìtd| d | ƒ‚|d|… | ||t|ƒ d…  }|||<  nqÃ| |¡ |d }qtd| ƒ |S )al  
def augment_move_comma(sents, ratio=0.02):
    """
    Move the comma from after a word to before the next word some fraction of the time

    We look for this exact pattern:
      w1, w2
    and replace it with
      w1 ,w2

    The idea is that this is a relatively common typo, but the tool
    won't learn how to tokenize it without some help.

    Note that this modification replaces the original text.
    """
    ...

def augment_apos(sents):
    """
    If there are no instances of ’ in the dataset, but there are instances of ',
    we replace some fraction of ' with ’ so that the tokenizer will recognize it.

    # TODO: we could do it the other way around as well
    """
    ...


def augment_ellipses(sents):
    """
    Replaces a fraction of '...' with '…'
    """
    ...


QUOTES = ['"', '“', '”', '«', '»', '「', '」', '《', '》', '„', '″']
QUOTES_RE = re.compile("(.?)[" + "".join(QUOTES) + "](.+)[" + "".join(QUOTES) + "](.?)")

# Alternate quote styles used by augment_quotes; START_QUOTES[i] is paired with END_QUOTES[i].
START_QUOTES = [...]
END_QUOTES = [...]

def augment_quotes(sents, ratio=0.15):
    """
    Go through the sentences and replace a fraction of sentences with alternate quotes

    TODO: for certain languages we may want to make some language-specific changes
      eg Danish, don't add «...»
    """
    ...


def find_text_idx(sentence):
    """
    Return the index of the # text line or -1
    """
    for idx, line in enumerate(sentence):
        if line.startswith("# text"):
            return idx
    return -1


DIGIT_RE = re.compile("[0-9]")

def change_indices(line, delta):
    """
    Adjust all indices in the given sentence by delta.  Useful when removing a word, for example
    """
    ...

def augment_initial_punct(sents, ratio=0.2):
    """
    If a sentence starts with certain punct marks, occasionally use the same sentence without the initial punct.

    Currently this just handles ¿
    This helps languages such as CA and ES where the models go awry when the initial ¿ is missing.
    """
    ...


def augment_brackets(sents, ratio=0.1):
    """
    If there are no sentences with [], transform some () into []
    """
    ...

def augment_punct(sents):
    """
    If there are no instances of ’ in the dataset, but there are instances of ',
    we replace some fraction of ' with ’ so that the tokenizer will recognize it.

    Also augments with ... / …
    """
    new_sents = augment_apos(sents)
    new_sents = augment_quotes(new_sents)
    new_sents = augment_move_comma(new_sents)
    new_sents = augment_comma_separations(new_sents)
    new_sents = augment_initial_punct(new_sents)
    new_sents = augment_ellipses(new_sents)
    new_sents = augment_brackets(new_sents)
    return new_sents


def write_augmented_dataset(input_conllu, output_conllu, augment_function):
    random.seed(1234)
    sents = read_sentences_from_conllu(input_conllu)
    new_sents = augment_function(sents)
    write_sentences_to_conllu(output_conllu, new_sents)


def remove_spaces_from_sentences(sents):
    """
    Makes sure every word in the list of sentences has SpaceAfter=No.

    Returns a new list of sentences
    """
    ...


def remove_spaces(input_conllu, output_conllu):
    """
    Turns a dataset into something appropriate for building a segmenter.

    For example, this works well on the Korean datasets.
    """
    sents = read_sentences_from_conllu(input_conllu)
    new_sents = remove_spaces_from_sentences(sents)
    write_sentences_to_conllu(output_conllu, new_sents)

def build_combined_korean_dataset(udbase_dir, tokenizer_dir, short_name, dataset, output_conllu):
    """
    Builds a combined dataset out of multiple Korean datasets.

    Currently this uses GSD and Kaist.  If a segmenter-appropriate
    dataset was requested, spaces are removed.

    TODO: we need to handle the difference in xpos tags somehow.
    """
    ...


def build_combined_korean(udbase_dir, tokenizer_dir, short_name):
    ...


def build_combined_italian_dataset(paths, model_type, dataset):
    ...


def check_gum_ready(udbase_dir):
    ...


def build_combined_english_dataset(paths, model_type, dataset):
    """
    en_combined is currently EWT, GUM, PUD, Pronouns, and handparsed
    """
    ...

def add_english_sentence_final_punctuation(handparsed_sentences):
    """
    Add a period to the end of a sentence with no punct at the end.

    The next-to-last word has SpaceAfter=No added as well.

    Possibly English-specific because of the xpos.  Could be upgraded
    to handle multiple languages by passing in the xpos as an argument
    """
    ...


def build_extra_combined_french_dataset(paths, model_type, dataset):
    """
    Extra sentences we don't want augmented for French - currently, handparsed lemmas
    """
    ...


def build_extra_combined_english_dataset(paths, model_type, dataset):
    """
    Extra sentences we don't want augmented
    """
    ...


def build_extra_combined_italian_dataset(paths, model_type, dataset):
    """
    Extra data - the MWT data for Italian
    """
    ...

def replace_semicolons(sentences):
    """
    Spanish GSD and AnCora have different standards for semicolons.

    GSD has semicolons at the end of sentences, AnCora has them in the middle as clause separators.
    Consecutive sentences in GSD do not seem to be related, so there is no combining that can be done.
    The easiest solution is to replace sentence final semicolons with "." in GSD
    """
    ...


def strip_column(sents, column):
    """
    Removes a specified column from the given dataset

    Particularly useful when mixing two different POS formalisms in the same tagger
    """
    ...


def strip_xpos(sents):
    """
    Removes all xpos from the given dataset

    Particularly useful when mixing two different POS formalisms in the same tagger
    """
    return strip_column(sents, 4)


def strip_feats(sents):
    """
    Removes all features from the given dataset

    Particularly useful when mixing two different POS formalisms in the same tagger
    """
    return strip_column(sents, 5)

def build_combined_albanian_dataset(paths, model_type, dataset):
    """
    sq_combined is STAF as the base, with TSA added for some things
    """
    ...

def build_combined_spanish_dataset(paths, model_type, dataset):
    """
    es_combined is AnCora and GSD put together

    For POS training, we put the different datasets into a zip file so
    that we can keep the conllu files separate and remove the xpos
    from the non-AnCora training files.  It is necessary to remove the
    xpos because GSD and PUD both use different xpos schemes from
    AnCora, and the tagger can use additional data files as training
    data without a specific column if that column is entirely blank

    TODO: consider mixing in PUD?
    """
    ...


def build_combined_french_dataset(paths, model_type, dataset):
    ...

def build_combined_hebrew_dataset(paths, model_type, dataset):
    """
    Combines the IAHLT treebank with an updated form of HTB where the annotation style more closely matches IAHLT

    Currently the updated HTB is not in UD, so you will need to clone
    git@github.com:IAHLT/UD_Hebrew.git to $UDBASE_GIT

    dev and test sets will be those from IAHLT
    """
    ...


COMBINED_FNS = {
    "en_combined": build_combined_english_dataset,
    "es_combined": build_combined_spanish_dataset,
    "fr_combined": build_combined_french_dataset,
    "he_combined": build_combined_hebrew_dataset,
    "it_combined": build_combined_italian_dataset,
    "sq_combined": build_combined_albanian_dataset,
}

COMBINED_EXTRA_FNS = {
    "en_combined": build_extra_combined_english_dataset,
    "fr_combined": build_extra_combined_french_dataset,
    "it_combined": build_extra_combined_italian_dataset,
}

def build_combined_dataset(paths, short_name, model_type, augment):
    ...


BIO_DATASETS = ("en_craft", "en_genia", "en_mimic")

def build_bio_dataset(paths, udbase_dir, tokenizer_dir, handparsed_dir, short_name, model_type, augment):
    """
    Process the en bio datasets

    Creates a dataset by combining the en_combined data with one of the bio sets
    """
    ...


def build_combined_english_gum_dataset(udbase_dir, tokenizer_dir, short_name, dataset, augment):
    """
    Build the GUM dataset by combining GUMReddit

    It checks to make sure GUMReddit is filled out using the included script
    """
    ...


def build_combined_english_gum(udbase_dir, tokenizer_dir, short_name, augment):
    ...


def prepare_ud_dataset(treebank, udbase_dir, tokenizer_dir, short_name, short_language, dataset, augment=True, input_conllu=None, output_conllu=None):
    ...

def process_ud_treebank(treebank, udbase_dir, tokenizer_dir, short_name, short_language, augment=True):
    """
    Process a normal UD treebank with train/dev/test splits

    SL-SSJ and other datasets with inline modifications all use this code path as well.
    """
    ...


# Fraction of the training data held out as a dev set when the treebank has no dev split.
XV_RATIO = ...

def process_partial_ud_treebank(treebank, udbase_dir, tokenizer_dir, short_name, short_language):
    """
    Process a UD treebank with only train/test splits

    For example, in UD 2.7:
      UD_Buryat-BDT
      UD_Galician-TreeGal
      UD_Indonesian-CSUI
      UD_Kazakh-KTB
      UD_Kurmanji-MG
      UD_Latin-Perseus
      UD_Livvi-KKPP
      UD_North_Sami-Giella
      UD_Old_Russian-RNC
      UD_Sanskrit-Vedic
      UD_Slovenian-SST
      UD_Upper_Sorbian-UFAL
      UD_Welsh-CCG
    """
    ...


def add_specific_args(parser):
    parser.add_argument('--no_augment', action='store_false', dest='augment', default=True,
                        help='Augment the dataset in various ways')
    parser.add_argument('--no_prepare_labels', action='store_false', dest='prepare_labels', default=True,
                        help='Prepare tokenizer and MWT labels.  Expensive, but obviously necessary for training those models.')
    convert_th_lst20.add_lst20_args(parser)
    convert_vi_vlsp.add_vlsp_args(parser)

def process_treebank(treebank, model_type, paths, args):
    """
    Processes a single treebank into train, dev, test parts

    Includes processing for a few external tokenization datasets:
      vi_vlsp, th_orchid, th_best

    Also, there is no specific mechanism for UD_Arabic-NYUAD or
    similar treebanks, which need integration with LDC datasets
    """
    ...


def main():
    common.main(process_treebank, common.ModelType.TOKENIZER, add_specific_args)


if __name__ == '__main__':
    main()