o
    hh=                     @   s   d Z ddlmZ ddlZddlZddlZddlmZ ddlm	  m
  m  mZ dZdd Zdd	 Zd
eefddZdd Zdd Zd
eefddZdd Zd
eefddZd,ddZdd Zdd Zd d! Zd"d# Zd$d% Zd&d' Zd(d) Zd*d+ ZdS )-z
Utils for the processing of NER datasets

These can be invoked from either the specific dataset scripts
or the entire prepare_ner_dataset.py script
    )defaultdictN)Document)traindevtestc                 C   sx   g }d}| D ]3}|dkr| | d}q|r,|ds |dr,| d|dd    q| d|dd    d}q|S )NFOB-S-   I-T)append
startswith)tagsnew_tags	in_entitytag r   Z/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/utils/datasets/ner/utils.pybioes_to_bio   s   
r   c           
      C   s   t D ]E}tj| d||f }tj|d||f }t|ddd}g }|D ]}dd |D }	t|	}	dd t||	D }|| q%t|| qd	S )
z
    Convert BIOES files back to BIO (not BIO2)

    Useful for preparing datasets for CoreNLP, which doesn't do great with the more highly split classes
    z%s.%s.bioesz	%s.%s.bior      )text_columnannotation_columnc                 S   s   g | ]}|d  qS )r   r   .0xr   r   r   
<listcomp>/       z(convert_bioes_to_bio.<locals>.<listcomp>c                 S   s   g | ]
\}}|d  |fqS )r   r   )r   r   yr   r   r   r   1       N)	SHARDSospathjoinread_tsvr   zipr   write_sentences)
base_input_pathbase_output_path
short_nameshardinput_filenameoutput_filenameinput_sentencesnew_sentencessentencer   r   r   r   convert_bioes_to_bio"   s   r/   bioc                 C   s   t ||D ]M\}}tj| d|||f }tj|s9tj| d||f }	tj|	r/|	}n
td||||	f tj|d||f }
td||
f  t||
 qdS )a  
    Convert BIO files to json

    It can often be convenient to put the intermediate BIO files in
    the same directory as the output files, in which case you can pass
    in same path for both base_input_path and base_output_path.

    This also will rewrite a BIOES as json
    %s.%s.%sz%s.%sz*Cannot find %s component of %s in %s or %s
%s.%s.jsonzConverting %s to %sN)	r$   r    r!   r"   existsFileNotFoundErrorprintprepare_ner_fileprocess_dataset)r&   r'   r(   suffixshard_namesshardsinput_shardoutput_shardr*   alt_filenamer+   r   r   r   convert_bio_to_json6   s   
r>   c                 C   s6   t  }| D ]}|D ]}|D ]	\}}|| qq	q|S )z~
    return the set of tags used in these datasets

    datasets is expected to be train, dev, test but could be any list
    )setadd)datasetsr   datasetr.   wordr   r   r   r   get_tagsL   s   rD   c                 C   s   t jt j| d dd t| dddE}t|D ]7\}}t|D ])\}}t|dkr1|dd }z	|d	|  W q! tyJ   td
||| f w |d qW d   dS 1 s\w   Y  dS )z8
    Write exactly one output file worth of dataset
    r   T)exist_okwutf-8encodingr
   Nz%s	%s
z0Unable to process sentence %d word %d of file %s
)	r    makedirsr!   splitopen	enumeratelenwrite	TypeError)r+   rB   foutsent_idxr.   word_idxrC   r   r   r   r%   Y   s   "r%   c           	      C   sN   t || D ]\}}tj|d|||f }t|| qt||||||d dS )a  
    write all three pieces of a dataset to output_dir

    datasets should be 3 lists: train, dev, test
    each list should be a list of sentences
    each sentence is a list of pairs: word, tag

    after writing to .bio files, the files will be converted to .json
    r1   )r9   r:   N)r$   r    r!   r"   r%   r>   	rA   
output_dirr(   r8   r9   r:   r)   rB   r+   r   r   r   write_dataseti   s   
rW   c                 C   s   g }|D ]}g }|D ]}|d |d |d d}| | q
| | qt| ddd}tj||dd W d    d S 1 s?w   Y  d S )	Nr   r   r
   )textner	multi_nerrF   rG   rH   indent)r   rM   jsondump)r+   rB   json_datasetr.   json_sentencerC   rR   r   r   r   write_multitag_jsonz   s   "ra   c           	      C   sn   t || D ]\}}tj|d|||f }t|| qt || D ]\}}tj|d||f }t|| q d S )Nr1   r2   )r$   r    r!   r"   r%   ra   rU   r   r   r   write_multitag_dataset   s   rb   TF	c                 C   sf  t | dd}| }	W d   n1 sw   Y  dd |	D }	g }
g }t|	D ]\}}|s9|r8|
| g }q)|rA|drAq)||}z|| }W n tyb } z
td|||f |d}~ww |dkrhq)z|| }W n! ty } z|r{d}n
td	|||f |W Y d}~nd}~ww |r||}|r|||< || q)|||f q)|r|
| |
S )
z
    Read sentences from a TSV file

    Returns a list of list of (word, tag)

    If keep_broken_tags==True, then None is returned for a missing.  Otherwise, an IndexError is thrown
    rG   rH   Nc                 S   s   g | ]}|  qS r   )stripr   r   r   r   r      r   zread_tsv.<locals>.<listcomp>#z,Could not find word index %d at line %d |%s|   z+Could not find tag index %d at line %d |%s|)rM   	readlinesrN   r   r   rL   
IndexError)filenamer   r   remap_fnskip_commentskeep_broken_tagskeep_all_columns	separatorfinlines	sentencescurrent_sentenceline_idxlinepiecesrC   er   r   r   r   r#      sR   



r#   c                 C   s$   t | }t|}t| ||| d S )N)r    listdirsortedrandom_shuffle_files)	input_dirrV   r(   input_filesr   r   r   random_shuffle_directory   s   
r|   c              	   C   sv  i }|D ]}| dd }||v rtd|| |f |||< qt|t|ks*J g }g }g }	|D ]0}
|
 dd }|d }t|d t }|dk rS||
 q2|dk r]||
 q2|	|
 q2tdt|t|t|	f  t|t| t|	 t|ksJ |||	g}g }|D ]}g }|D ]}
|tt	j
| |
dd	 q|| qt||| t|t|t|	fS )
aU  
    Shuffle the files into different chunks based on their filename

    The first piece of the filename, split by ".", is used as a random seed.

    This will make it so that adding new files or using a different
    annotation scheme (assuming that's encoding in pieces of the
    filename) won't change the distibution of the files
    .r   z.Multiple files with the same prefix: %s and %sz.txt.4class.tsvr
   gffffff?g?z.Train files: %d  Dev files: %d  Test files: %dr   )rL   
ValueErrorrO   randomseedr   r5   extendr#   r    r!   r"   rW   )rz   r{   rV   r(   
input_keysfr   train_files	dev_files
test_filesri   location
file_listsrA   filesrB   r   r   r   ry      s>   

$
ry   c                    s  t | }t|}tt}|D ]&}| D ]}|| D ]	}||r$ nqq ntd| || | qd}	d}
d}| D ]-}t	  t	dt
|| |f  t| || |d |f \}}}|	|7 }	|
|7 }
||7 }q@t	  t	d|	|
|f   fdd|D }t|||  d S )Nz=Could not assign %s to any of the divisions in the prefix_mapr   zProcessing %d files from %s%s-%sz?After shuffling: Train files: %d  Dev files: %d  Test files: %dc                    s   g | ]}d  |f qS )r   r   )r   divisionr(   r   r   r     s    z.random_shuffle_by_prefixes.<locals>.<listcomp>)r    rw   rx   r   listkeysr   r~   r   r5   rO   ry   combine_dataset)rz   rV   r(   
prefix_mapr{   file_divisionsri   r   prefixnum_train_filesnum_dev_filesnum_test_filesd_traind_devd_testdataset_divisionsr   r   r   random_shuffle_by_prefixes   s6   

 
r   c              
   C   s   g }t D ]B}g }|D ]6}d||f }tj| |}	t|	dd}
t|
}dd |D }|| W d    n1 s;w   Y  q
|| qt	||| d S )Nr2   rG   rH   c                 S   s   g | ]	}d d |D qS )c                 S   s   g | ]
}|d  |d fqS )rX   rY   r   )r   rC   r   r   r   r   '  r   z.combine_dataset.<locals>.<listcomp>.<listcomp>r   )r   r.   r   r   r   r   '  s    z#combine_dataset.<locals>.<listcomp>)
r   r    r!   r"   rM   r]   loadr   r   rW   )rz   rV   input_datasetsoutput_datasetrA   r)   full_datasetinput_datasetr*   
input_pathro   rB   	convertedr   r   r   r     s   
r   c                 C   s  d}t  }g }i }t| ddm}|D ]I}| }|drq|s!q|dr?|dur.|||< g }|dd   dd}q|sGtd	|  || ||v rVtd
| |	| q|rl|rt|||< W d   |S W d   |S W d   |S 1 sw   Y  |S )z
    Read a prefix file such as the one for the Worldwide dataset

    the format should be

    africa:
    af_
    ...

    asia:
    cn_
    ...
    NrG   rH   re   : _zBFound a prefix before the first label was assigned when reading %szFound the same prefix twice! %s)
r?   rM   rd   r   endswithlowerreplaceRuntimeErrorr   r@   )destination_filedestinationknown_prefixesprefixesr   ro   rt   r   r   r   read_prefix_file,  sD   






r   c                 C   sB   t | }tt|}t|W  d   S 1 sw   Y  dS )zj
    Read entities from a file, return a list of (text, label)

    Should work on both BIOES and BIO
    N)rM   r   r]   r   list_doc_entities)ri   ro   docr   r   r   read_json_entitiesX  s   
$r   c                 C   s  g }| j D ]}g }d}|jD ]}|jdks|jdr;|jdr'||j |r:|dus/J |||f g }d}n||jdrq|durj|dkrj||jdd krj|rj|dusZJ |||f g }|jdd }||j nF|jds}|jdr|r|dusJ |||f g }d}||j |jdd }|jdr|dusJ || g }d}ntd|jdd }q|r|dusJ |||f qd	d
 |D }|S )zO
    Return a list of (text, label)

    Should work on both BIOES and BIO
    Nr   zE-r   r
   r   r	   z)Expected BIO(ES) format in the json file!c                 S   s    g | ]}t |d  |d fqS )r   r   )tupler   r   r   r   r     s     z%list_doc_entities.<locals>.<listcomp>)rq   tokensrY   r   r   rX   r   )r   entitiesr.   current_entityprevious_labeltokenr   r   r   r   c  sX   

"
r   c              	   G   s   g }|D ] }t |}t|}|| W d   n1 sw   Y  qt | d}tj||dd W d   dS 1 s>w   Y  dS )z;
    Combine multiple NER json files into one NER file
    NrF   r
   r[   )rM   r]   r   r   r^   )r+   input_filenamesr   ri   ro   new_docrR   r   r   r   combine_files  s   

"r   )NTFFrc   )__doc__collectionsr   r]   r    r   stanza.models.common.docr   *stanza.utils.datasets.ner.prepare_ner_fileutilsrA   rY   r6   r   r   r/   r>   rD   r%   rW   ra   rb   r#   r|   ry   r   r   r   r   r   r   r   r   r   r   <module>   s2    
	52",1