o
    h'                     @   s
  d dl Z d dlZd dlZd dlZd dlZd dlZd dlmZ zd dlmZ W n e	y8   d dl
m  mZ Y nw dZdZdZdZdd	 Zd
d Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zd(d!d"Zd#d$ Zd%d& Zed'kre  dS dS ))    N)tqdm)etreezhttp://www.tei-c.org/ns/1.0zann_morphosyntax.xmlzann_named.xmlzann_segmentation.xmlc                 C   s&   t j| sd S t| }| }|S N)ospathexistsr   parsegetroot)r   etrt r   a/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/utils/datasets/ner/convert_nkjp.py	parse_xml   s
   
r   c                 C   s
   |  dS )Nz({http://www.w3.org/XML/1998/namespace}idget)noder   r   r   get_node_id   s   
r   c                 C   s   t | |}t| ||}|S r   )%extract_unassigned_subfolder_entitiesassign_entities)	subfoldernkjp_dirsubfolder_entitiespar_id_to_segsr   r   r   extract_entities_from_subfolder#   s   
r   c                 C   s   t j|| t}t|}|du rdS i }|dttttf }|D ]/}i }t|d\}}	|dt }
|
D ]}|	d}|d\}}t
|||< q7|||	< q!|S )zB
    Build and return a map from par_id to extracted entities
    N{%s}TEI/{%s}text/{%s}body/{%s}p_{%s}scorrespz#morph_)r   r   joinNER_FILEr   findall	NAMESPACEr   splitr   extract_entities_from_sentence)r   r   ner_pathr   r   ner_parsparpar_entitiesr   par_id	ner_sentsner_sentr   ner_sent_idr   r   r   r   *   s    

r   c                 C   s   |  dt }i }t|D ]S\}}t|}dd | dt D }| dtttf d j}| dtttf d d}| d	tttf }	|	rQ|	d d}
nd }
|||||
|d
}|||< qt|}|S )Nz	./{%s}segc                 S   s   g | ]}| d qS )targetr   ).0ptrr   r   r   
<listcomp>F   s    z2extract_entities_from_sentence.<locals>.<listcomp>z	./{%s}ptrz'./{%s}fs/{%s}f[@name='orth']/{%s}stringr   z'./{%s}fs/{%s}f[@name='type']/{%s}symbolvaluez*./{%s}fs/{%s}f[@name='subtype']/{%s}symbol)ent_idindexorthner_typener_subtypetargets)r    r!   	enumerater   textr   clear_entities)r*   segssent_entitiesisegr1   r6   r3   r4   ner_subtype_noder5   entitycleared_entitiesr   r   r   r#   ?   s*   
r#   c                 C   sP   t |  tt|  dd d}t|} | D ]}|d }dd |D |d< q| S )Nc                 S      | d S )Nr2   r   )entr   r   r   <lambda>\       z clear_entities.<locals>.<lambda>keyr6   c                 S   s   g | ]	}| d d qS )morph_   )r"   )r-   tr   r   r   r/   `   s    z"clear_entities.<locals>.<listcomp>)resolve_entitiessortedlistvalueseliminate_overlapping_entities)entitiesentities_listr?   r6   r   r   r   r9   Y   s   r9   c                    s4    fdd   D } D ]
}||  | d< qd S )Nc                    s   i | ]
\}}|t | qS r   )resolve_entity)r-   	entity_idr?   rO   r   r   
<dictcomp>f       z$resolve_entities.<locals>.<dictcomp>r6   )items)rO   resolved_targetsrR   r   rS   r   rJ   d   s   rJ   c                 C   sH   | d }g }|D ]}| dr|| }|t|| q|| q|S )Nr6   named_)
startswithextendrQ   append)r?   rO   r6   resolvedr,   target_entityr   r   r   rQ   k   s   
rQ   c                    sf   t g t| D ]!\}}| d | D ] t fdd|d D r(|d  qqfdd| D S )Nc                    s   g | ]}| d  v qS )r6   r   r-   r,   )overr   r   r/      s    z2eliminate_overlapping_entities.<locals>.<listcomp>r6   r1   c                    s   g | ]
}|d   vr|qS )r1   r   )r-   r?   )subsumedr   r   r/      rU   )setr7   anyadd)rP   sub_isubr   )r_   r`   r   rN   y   s   rN   c                    s  t j|| t}t|}|dttttf }i  |D ]`}t|d\}}|dt }	i }
|	D ]E}t|d\}}|dt }i }t	|D ](\}}t|d\}}|dtttf d j
}||||ddd d	}|||< qI||
|< q1|
 |< q|d u rd S |D ]G| }|D ]>| }|D ]5}|d
 }d}|d }t fdd|D dd d}|D ]}| d| }||d< |d |d< d}qqqq S )Nr   r   r   z{%s}segrG   z%{%s}fs/{%s}f[@name='orth']/{%s}stringr   O)seg_idr<   r3   r8   tagnerr5   r6   Br4   c                    s   g | ]
}   | qS r   r   r^   r   par_keysent_keyr   r   r/      rU   z#assign_entities.<locals>.<listcomp>c                 S   rA   Nr<   r   )xr   r   r   rC      rD   z!assign_entities.<locals>.<lambda>rE   -ri   r5   I)r   r   r   
MORPH_FILEr   r    r!   r   r"   r7   r8   rK   )r   r   r   
morph_pathr   
morph_parsr&   r   r(   morph_sentssent_id_to_segs
morph_sentsent_idr:   	sent_segsr<   r=   rg   r3   tokenpar_entsr;   r?   r6   iob	ner_labelmatching_tokens
full_labelr   rk   r   r      s\   


"r   c                    sT   i }t t }t fdd|D D ]}t| }|r"|||< qt|d q|S )Nc                    s&   g | ]}t jt j |r|qS r   )r   r   isdirr   )r-   namer   r   r   r/      s   & z!load_xml_nkjp.<locals>.<listcomp>zhas no ann_named.xml file)rK   r   listdirr   r   print)r   subfolder_to_annotations
subfoldersr   outr   r   r   load_xml_nkjp   s   

r   T?皙?c           
      C   s|   t d |rt |  |sd| }t|t|  }t|t|  }| d | }| |||  }| || d  }	|||	dS )Nih:rH   )traindevtest)randomseedshuffleintlen)
datasetr   train_fractiondev_fractiontest_section
train_sizedev_sizer   r   r   r   r   r   split_dataset   s   

r   c              	   C   s  t d|   tj| rR| ds| drRt /}t d| |f  t| d}|	| W d   n1 s9w   Y  t
|}W d   n1 sLw   Y  ntj| r]t
| }ntdg }| D ]J\}}| D ]A\}}	| d| }
g }|	 D ]$\}}| }t|d	d
 d}|D ]}|d}|d}|| qq|
|d d< || qoqgt|}| D ]/\}}|rttj|d| dddd}tj||ddd W d   n1 sw   Y  qdS )zvConverts NKJP NER data into IOB json format.

    nkjp_dir is the path to directory where NKJP files are located.
    zReading data from %sz.tar.gzz.tgzzTemporarily extracting %s to %szr:gzNz3Cannot find either unpacked dataset or gzipped file|c                 S   rA   rn   r   )tokr   r   r   rC      rD   zconvert_nkjp.<locals>.<lambda>rE   r<   rg   r   paragraph_idzpl_nkjp.z.jsonwzutf-8)encodingF   )ensure_asciiindent)r   r   r   isfileendswithtempfileTemporaryDirectorytarfileopen
extractallr   r   FileNotFoundErrorrV   rM   rK   popr[   r   r   jsondump)	nkjp_path
output_dirr   tar_insubfolder_to_entities	convertedsubfolder_nameparsr(   r&   paragraph_identifier
par_tokensr   senttokenssrtrz   r"   
split_namefr   r   r   convert_nkjp   sJ    




"r   c                  C   sF   t  } | jdtddd | jdtddd |  }t|j|j d S )Nz--input_pathz@/u/nlp/data/ner/stanza/polish/NKJP-PodkorpusMilionowy-1.2.tar.gzzWhere to find the files)typedefaulthelpz--output_pathzdata/nerzWhere to output the results)argparseArgumentParseradd_argumentstr
parse_argsr   
input_pathoutput_path)parserargsr   r   r   main   s
   r   __main__)Tr   r   T)r   r   r   r   r   r   r   lxmlr   ImportErrorxml.etree.ElementTreeElementTreer!   rr   r   SEGMENTATION_FILEr   r   r   r   r#   r9   rJ   rQ   rN   r   r   r   r   r   __name__r   r   r   r   <module>   sB    1
)	
