o
    hq$                     @   s2  d Z ddlZddlZddlZddlmZ ddlZddlmZ ddlZddl	m
Z
 ddlm  m  m  mZ dd Zdd	 Zd
d Zdd Zdd Zdd Zedkre Zejdeddd ejdeddd ejdeddd ejdeddd ejdedd d ejd!edd"d e Zee dS dS )#a6  
Converts a .json file from AMT to a .bio format and then a .json file

To ignore Facility and Product, turn NORP into miscellaneous:

 python3 stanza/utils/datasets/ner/convert_amt.py --input_path /u/nlp/data/ner/stanza/en_amt/output.manifest --ignore Product,Facility --remap NORP=Miscellaneous

To turn all labels into the 4 class used in conll03:

  python3 stanza/utils/datasets/ner/convert_amt.py --input_path /u/nlp/data/ner/stanza/en_amt/output.manifest --ignore Product,Facility --remap NORP=MISC,Miscellaneous=MISC,Location=LOC,Person=PER,Organization=ORG
    N)
itemgetter)tqdm)write_sentencesc                 C   sn  g }d}d}d}t | dd}t|D ]\}}t|}t| dgkr*|d7 }qd|vr3|d7 }q|d }	d}
| D ]0}|dksH|drIq=d|| vrPq=|| d }d	|vr[q=d	|v rm|
duritd
| |d	 }
q=|
du rw|d7 }qtdd |
D }|r|d7 }|dkrt	d t	|
 |
|	|
f qW d   n1 sw   Y  t	dt||||f  |S )a  
    Read the json file and extract the NER labels

    Will not return lines which are not labeled

    Return format is a list of lines
    where each line is a tuple: (text, labels)
    labels is a list of maps, {'label':..., 'startOffset':..., 'endOffset':...}
    r   zutf-8)encodingsource   Nmetadataannotationsentitiesz0Found a map with multiple annotations at line %dc                 3   s&    | ] t  fd ddD V  qdS )c                 3   s    | ]}| vV  qd S )N .0xentityr   `/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/utils/datasets/ner/convert_amt.py	<genexpr>F   s    z&read_json.<locals>.<genexpr>.<genexpr>)labelstartOffset	endOffsetN)any)r   r   r   r   r   F   s    zread_json.<locals>.<genexpr>zIFound an entity which was missing either label, startOffset, or endOffsetz_Found %d labeled lines.  %d lines were blank, %d lines were broken, and %d lines were unlabeled)open	enumeratejsonloadssortedkeysendswith
ValueErrorr   printappendlen)input_filenamedocsblank	unlabeledbrokenfinline_idxlinedocr   r
   kr	   	is_brokenr   r   r   	read_json   sX   

(r-   c                    s,    s| S t  d  fdd| D }|S )N,c                    s*   g | ]}|d   fdd|d D fqS )r   c                    s   g | ]
}|d   vr|qS )r   r   r   ignoredr   r   
<listcomp>Y       z4remove_ignored_labels.<locals>.<listcomp>.<listcomp>r   r   )r   r*   r/   r   r   r1   Y   s    "z)remove_ignored_labels.<locals>.<listcomp>)setsplit)r#   r0   new_docsr   r/   r   remove_ignored_labelsR   s   
r6   c           
      C   s   |s| S i }| dD ]}| d}|d ||d < qt| g }| D ]%}t|d }|D ]}||d |d |d< q.|d |f}	||	 q#|S )Nr.   =r   r   r   )r4   r   copydeepcopygetr    )
r#   remap
remappings	remappingpiecesr5   r*   r
   r   new_docr   r   r   remap_labels]   s   
r@   c                 C   s   g }d}d}d}| D ]P}|\}}t |dd d}g }|D ]6}	|d7 }t|D ]&}
|	d |
d kr>|	d |
d kr>|d7 } n|	d |
d k rL|d7 } nq&||	 q|||f q
td|||f  |S )	z
    Currently the NER tool does not handle nesting, so we just throw away nested entities

    In the event of entites which exactly overlap, the first one in the list wins
    r   c                 S   s   | d | d  fS )Nr   r   r   )r   r   r   r   <lambda>~   s    z remove_nesting.<locals>.<lambda>)keyr   r   r   z7Ignored %d exact and %d nested labels out of %d entries)r   reversedr    r   )r#   r5   nestedexacttotalr*   r   labels
new_labelsr   otherr   r   r   remove_nestingq   s.    
rJ   c                 C   s`  || }|j }|D ]}|jD ]}d|_qq	|D ]}|d }|d }	|d }
|D ]}|jd j|	kr;|jd j|
kr; nq'qd}d}t|jD ]U\}}|j|	kr\|j|	kr\|}d| |_n2|dur|j|
krr|dkrr|j|d	  } n*|j|
kr|dkr|jd
v r|j|d	  } nd| |_|j|
kr|du r|} nqF|du s|du rtdqdd |D S )zt
    Given a source text and a list of labels, tokenize the text, then assign labels based on the spans defined
    Or   r   r   r   NzB-r   )r.   .zI-zThis should not happenc                 S   s   g | ]
}d d |j D qS )c                 S   s   g | ]}|j |jfqS r   )textner)r   tokenr   r   r   r1      s    z*process_doc.<locals>.<listcomp>.<listcomp>)tokens)r   sentencer   r   r   r1      r2   zprocess_doc.<locals>.<listcomp>)	sentencesrQ   rO   
start_charend_charr   rN   AssertionError)r   rG   piper*   rS   rR   rP   r   rO   start_offset
end_offsetstart_token	end_token	token_idxr   r   r   process_doc   sL   

r]   c                 C   s   t | j}t|dkrtd dS t|| j}t|| j}t|}t	j
| jdd}g }t|D ]}|tg ||R   q/tdt|  | j}t| j| td| j  |drd|dd	 d
 }n|d
 }t|| dS )z
    Read in a .json file of labeled data from AMT, write out a converted .bio file

    Enforces that there is only one set of labels on a sentence
    (TODO: add an option to skip certain sets of labels)
    r   z,Error: no documents found in the input file!Ntokenize)
processorszUFound %d total sentences (may be more than #docs if a doc has more than one sentence)zSentences written to %sz.bioz.json)r-   
input_pathr!   r   r6   ignorer@   r;   rJ   stanzaPipelinelanguager   extendr]   output_pathr   r   prepare_ner_fileprocess_dataset)argsr#   rW   rS   r*   bio_filenamejson_filenamer   r   r   main   s&   

rm   __main__z
--languageenzLanguage to process)typedefaulthelpz--input_pathzoutput.manifestzWhere to find the filesz--output_pathzdata/ner/en_amt.test.biozWhere to output the resultsz--json_output_pathzIWhere to output .json.  Best guess will be made if there is no .json filez--ignorez:Ignore these labels: comma separated list without B- or I-z--remapz)Remap labels: comma separated list of X=Y)__doc__argparser8   r   operatorr   sysr   rc   stanza.utils.datasets.ner.utilsr   *stanza.utils.datasets.ner.prepare_ner_fileutilsdatasetsrO   rh   r-   r6   r@   rJ   r]   rm   __name__ArgumentParserparseradd_argumentstr
parse_argsrj   r   r   r   r   <module>   s6    9!0 