o
    h+                     @   s   d Z ddlZddlmZ edZdZeeedg Z	dd Z
d	d
 Zdd Zdd Zdd Zdd Zdd Zdd Zdd ZdS )z1
Utility functions for dealing with NER tagging.
    N)EMPTYstanza)_- NOc                 C   s0   | D ]}t |dkr|dd dv r dS qdS )z
    Check if a basic tagging scheme is used. Return True if so.

    Args:
        all_tags: a list of NER tags

    Returns:
        True if the tagging scheme does not use B-, I-, etc, otherwise False
       N)B-I-S-E-B_I_S_E_FTlenall_tagstag r   R/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/models/ner/utils.pyis_basic_scheme   s
   
r   c                 C   s:   | D ]}|t v r	qt|dkr|dd dv rq dS dS )z
    Check if BIO tagging scheme is used. Return True if so.

    Args:
        all_tags: a list of NER tags
    
    Returns:
        True if the tagging scheme is BIO, otherwise False
    r   N)r	   r
   r   r   FT)EMPTY_OR_O_TAGr   r   r   r   r   is_bio_scheme   s   
r   c                 C   s   g }t | D ]G\}}|tv r|| q|d dkrH|dks6| |d  dks6| |d  dd |dd krB|d|dd   q|| q|| q|S )a  
    Convert the original tag sequence to BIO2 format. If the input is already in BIO2 format,
    the original input is returned.

    Args:
        tags: a list of tags in either BIO or BIO2 format
    
    Returns:
        new_tags: a list of tags in BIO2 format
    r   I   r   NB	enumerater   appendtagsnew_tagsir   r   r   r   to_bio21   s   8r%   c                 C   st   g }t | D ]1\}}|tv r|| q|dks(| |d  dks(| |d  |kr0|d|  q|d|  q|S )a  
    Convert a basic tag sequence into a BIO sequence.
    You can compose this with bio2_to_bioes to convert to bioes

    Args:
        tags: a list of tags in basic (no B-, I-, etc) format

    Returns:
        new_tags: a list of tags in BIO format
    r   r   r   r	   r
   r   r!   r   r   r   basic_to_bioI   s   (r&   c                 C   s$  g }t | D ]\}}|tv r|| qt|dk r!td| |dd dv rU|d t| k rI| |d  dd dv rI|d|dd   q|d|dd   q|dd dv r|d t| k r}| |d  dd dv r}|d	|dd   q|d
|dd   qtd| |S )z
    Convert the BIO2 tag sequence into a BIOES sequence.

    Args:
        tags: a list of tags in BIO2 format

    Returns:
        new_tags: a list of tags in BIOES format
    r   zInvalid BIO2 tag found: N)r
   r   r   r
   r   )r	   r   r	   r   zInvalid IOB tag found: )r   r   r    r   	Exceptionr!   r   r   r   bio2_to_bioes_   s    
((r(   c                 C   s   dd | D }|S )z
    If any tags are None, _, -, or blank, turn them into EMPTY

    The input should be a list(sentence) of list(word) of tuple(text, list(tag))
    which is the typical format for the data at the time data.py is preprocessing the tags
    c                 S      g | ]	}d d |D qS )c                 S   s*   g | ]}|d  t dd |d D fqS )r   c                 s   s     | ]}|t v r
tn|V  qd S N)	EMPTY_TAGr   .0xr   r   r   	<genexpr>   s    z=normalize_empty_tags.<locals>.<listcomp>.<listcomp>.<genexpr>r   tupler-   wordr   r   r   
<listcomp>   s   * z3normalize_empty_tags.<locals>.<listcomp>.<listcomp>r   r-   sentencer   r   r   r4      s    z(normalize_empty_tags.<locals>.<listcomp>r   )	sentencesnew_sentencesr   r   r   normalize_empty_tags   s   r9   c                    sN  g }g }d}t | D ]C\}}t| \}}|| |s0tdd |D r0|dkr.td| d}|rHtdd |D sAtd| d	d
 |D }|| q
tdd |D t |D ]\}}tfdd|D rptd| q[g }	g }
tD ]G d}d} fdd
|D }t|}| ot	|}|r|
 dkrd}td   n|r|
 dkrd}td   |	| |
| qyg }t||D ]R\}}dd
 |D }t t|	|
D ]2\ \}} fdd
|D }|rtt|}n
t|}|rt|}t |D ]\}}|||  < q q|dd
 t||D  q|r%dd
 |D }|S )z
    Convert tags in these sentences to bioes

    We allow empty tags ('_', '-', None), which will represent tags
    that do not get any gradient when training
    Fc                 s   "    | ]}|d u pt |tV  qd S r*   
isinstancestrr-   r   r   r   r   r/           zprocess_tags.<locals>.<genexpr>r   zGGot a mix of tags and lists of tags.  First non-list was in sentence %dTc                 s   r:   r*   r;   r>   r   r   r   r/      r?   zLGot a mix of tags and lists of tags.  First tag as a list was in sentence %dc                 S   s   g | ]}|fqS r   r   r>   r   r   r   r4      s    z process_tags.<locals>.<listcomp>c                 s   s"    | ]}|D ]}t |V  qqd S r*   r   )r-   r"   r.   r   r   r   r/      r?   c                 3   s    | ]	}t | k V  qd S r*   r   r,   )max_columnsr   r   r/      s    zQNER tags not uniform in length at sentence %d.  TODO: extend those columns with Oc                    s   g | ]}|D ]}|  qqS r   r   )r-   sentr.   
column_idxr   r   r4      s    bioeszOBIO tagging scheme found in input at column %d; converting into BIOES scheme...zQBasic tagging scheme found in input at column %d; converting into BIOES scheme...c                 S   r)   )c                 S   s   g | ]}|qS r   r   r,   r   r   r   r4      s    +process_tags.<locals>.<listcomp>.<listcomp>r   )r-   sentence_tagsr   r   r   r4          c                    s   g | ]}|  qS r   r   r,   rB   r   r   r4      s    c                 S   s   g | ]
\}}|t |fqS r   r0   )r-   wtr   r   r   r4      s    c                 S   r)   )c                 S   s    g | ]}|d  |d d  fqS )r   r   r   r2   r   r   r   r4      s     rE   r   r5   r   r   r   r4      rG   )r   zipr    any
ValueErrorallmaxranger   r   lowerloggerdebugr(   r&   r%   )r7   scheme	all_wordsr   converted_tuplessent_idxrA   wordsr"   all_convert_bio_to_bioesall_convert_basic_to_bioesconvert_bio_to_bioesconvert_basic_to_bioes
tag_columnis_biois_basicresulttag_idxr   r   )rC   r@   r   process_tags   sj   

ra   c                    s   g g d  fdd}t | D ]c\}}|du rd}|dkr&|  g q|dr8|  |g|dd  q|drI| |dd  q|dr_| |dd  |  g q|d	ru|  |g|dd  |  g q|  S )
z
    Decode from a sequence of BIOES tags, assuming default tag is 'O'.
    Args:
        tags: a list of BIOES tags
    
    Returns:
        A list of dict with start_idx, end_idx, and type values.
    Nc                      s.   t dkrd d  d d S d S )Nr   )startendtype)r   r    r   cur_typeent_idxsresr   r   flush   s   z decode_from_bioes.<locals>.flushr   r	   r   r
   r   r   )r   
startswithr    )r"   rj   idxr   r   rf   r   decode_from_bioes   s>   	





rm   c                  G   s  t | d }| dd D ]}d}|t|k r|| dkr!|d7 }q|| dr9|| dkr4|| ||< |d7 }q|| dsHtd|||}|d }|t|k r|| dd || dd krltd	|||| d
rtn|| dstd	|||d7 }|t|k sT|t|krtd||d }tdd ||| D r||| |||< |}|t|k sq|S )z~
    Merge multiple sequences of NER tags into one sequence

    Only O is replaced, and the earlier tags have precedence
    r   r   Nr   r   r	   z)Got unexpected tag sequence at idx {}: {}r   z%Unexpected tag sequence at idx {}: {}r   r
   z'Got a sequence with an unclosed tag: {}c                 s   s    | ]}|d kV  qdS )r   Nr   r,   r   r   r   r/   6  s    zmerge_tags.<locals>.<genexpr>)listr   rk   rL   formatrM   )	sequencesr"   sequencerl   	start_idxend_idxr   r   r   
merge_tags  sD    &rt   )__doc__loggingstanza.models.common.vocabr   	getLoggerrQ   r+   r1   rn   r   r   r   r%   r&   r(   r9   ra   rm   rt   r   r   r   r   <module>   s    
 P1