o
    h!                     @   s`  d dl Z d dlZd dlZd dlZd dlZd dlZejjejjgZ	d dl
mZ d dlmZ d dlmZ d dlmZ edZd dd	Zd
 dd	Zd dd	Zd dd	Zd dd	Zejdddd Zdd Zdd Zdd Z edddZ!dd Z"dd Z#dd  Z$d!d" Z%d#d$ Z&d%d& Z'd'd( Z(d)d* Z)d+d, Z*d-d. Z+d/d0 Z,dS )1    N)
ner_tagger)TrainerTEST_WORKING_DIR)process_datasetstanzazr
Chris B-PERSON
Manning E-PERSON
is O
a O
good O
man O
. O

He O
works O
in O
Stanford B-ORG
University E-ORG
. O
 	zO
Chris B-PERSON
Manning E-PERSON
is O
part O
of O
Computer B-ORG
Science E-ORG
z
Chris B-PERSON B-PER
Manning E-PERSON E-PER
is O O
a O O
good O O
man O O
. O O

He O O
works O O
in O O
Stanford B-ORG B-ORG
University E-ORG B-ORG
. O O
z
Chris B-PERSON -
Manning E-PERSON -
is O -
a O -
good O -
man O -
. O -

He O -
works O -
in O -
Stanford B-ORG -
University E-ORG -
. O -
zm
Chris B-PERSON B-PER
Manning E-PERSON E-PER
is O O
part O O
of O O
Computer B-ORG B-ORG
Science E-ORG E-ORG
module)scopec                   C   s
   t  dS )Nz/in/tiny_emb.ptr    r   r   ]/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/tests/ner/test_ner_training.pypretrain_fileY   s   
r   c                 C   sZ   t j| d d }t|ddd}|| W d    n1 s!w   Y  t||  d S )Nr   z.biowutf-8encoding)ospathsplitextopenwriter   )filenamebio_databio_filenamefoutr   r   r   write_temp_file]   s
   r   c           	      C   s   g }| d}|D ]$}|g  | dD ]}|j ddd\}}|d ||  d qq	t| dd	d
}t|| W d    d S 1 sGw   Y  d S )Nz


r	      )maxsplit)text	multi_nerr   r   r   )splitappendr   jsondump)	r   r   doc	sentencessentencewordr!   tagsr   r   r   r   write_temp_2tagc   s   


"r,   c                 G   sL   | d }dt | d|dt |dt |dddd	d
ddt |g}|t| }|S )Nmodelsz
--data_dirz--wordvec_pretrain_filez--train_filez--eval_filez--shorthanden_testz--max_steps100z--eval_interval40z
--save_dir)strlist)tmp_pathr   
train_jsondev_json
extra_argssave_dirargsr   r   r   get_argsr   s   	r9   
train_datac                G   sD   |d }t || |d }t |t t|| ||g|R  }t|S Nzen_test.train.jsonzen_test.dev.json)r,   EN_DEV_2TAGr9   r   main)r   r3   r;   r6   r4   r5   r8   r   r   r   run_two_tag_training      


r?   c                 C   sP   t | |}t|jjdksJ t|jjdksJ t|jd  dks&J d S )N   tag)r?   lenmodeltag_clfscritsvocablensr   r3   trainerr   r   r   test_basic_two_tag_training   s   
rK   c                 C   s   t | |}|tj|jd |jd  t | |d}t|jjdks%J t|jjdks/J t	|jj|jjD ]\}}t
|j|jrFJ q8dS )z
    Test that the training is backproping both tags

    We can do this by using the "finetune" mechanism and verifying
    that the output tensors are different
    r7   	save_name
--finetunerA   N)r?   saver   r   joinr8   rC   rD   rE   ziptorchallcloseweight)r   r3   rJ   new_trainerold_clfnew_clfr   r   r   test_two_tag_training_backprop   s   
 rW   c                 C   s   t | |}|tj|jd |jd  t | |dtd}t|jj	dks'J t|jj	dks1J t
|jj	d j|jj	d jrCJ t
|jj	d j|jj	d jsUJ dS )	z
    Test that the training is backproping only one tag if one column is blank

    We can do this by using the "finetune" mechanism and verifying
    that the output tensors are different in just the first column
    r7   rL   rM   r:   rA   r   r   N)r?   rN   r   r   rO   r8   EN_TRAIN_2TAG_EMPTY2rC   rD   rE   rQ   rR   rS   )r   r3   rJ   rT   r   r   r   !test_two_tag_training_c2_backprop   s   
 $(rY   c                 C   s   t | |d}t|jjdksJ t|jjdksJ t|jd  dks'J |jjd jjd |jd  d |jjd jjd  ksFJ d S )Nz--connect_output_layersrA   rB   r   r   )	r?   rC   rD   rE   rF   rG   rH   rS   shaperI   r   r   r   test_connected_two_tag_training   s
   Br[   c                 G   sD   |d }t |t |d }t |t t|| ||g|R  }t|S r<   )r   EN_TRAIN_BIO
EN_DEV_BIOr9   r   r>   )r   r3   r6   r4   r5   r8   r   r   r   run_training   r@   r^   c                 C   sL   t | |}tj std dS |j}t| j	}t
|ds$J dS )d
    Briefly train an NER model (no expectation of correctness) and check that it is on the GPU
    zICannot check that the NER model is on the GPU, since GPU is not availableNcuda)r^   rQ   r`   is_availablewarningswarnrD   next
parametersdevicer1   
startswithr   r3   rJ   rD   rf   r   r   r   test_train_model_gpu   s   


ri   c                 C   s6   t | |d}|j}t| j}t|dsJ dS )r_   z--cpucpuN)r^   rD   rd   re   rf   r1   rg   rh   r   r   r   test_train_model_cpu   s   rk   c                 C   s.   t j| dd dd}tdd |d  D S )Nc                 S      | S Nr   storagelocr   r   r   <lambda>       z%model_file_has_bert.<locals>.<lambda>Tweights_onlyc                 s       | ]}| d V  qdS zbert_model.Nrg   .0xr   r   r   	<genexpr>       z&model_file_has_bert.<locals>.<genexpr>rD   )rQ   loadanykeys)r   
checkpointr   r   r   model_file_has_bert   s   r   c                 C   s8   t | |dd}tj|jd |jd }t|rJ d S )N--bert_modelhf-internal-testing/tiny-bertr7   rL   )r^   r   r   rO   r8   r   )r   r3   rJ   
model_filer   r   r   test_with_bert   s   r   c                 C   s   t | |ddd}tj|jd |jd }t|sJ tj|d|jd  }tj|d|jd  }|| t|s>J t|j|d}|| t|sPJ d S )	Nr   r   z--bert_finetuner7   rL   foo_bar_r8   r   )r^   r   r   rO   r8   r   rN   r   )r   r3   rJ   r   foo_save_filenamebar_save_filenamereloaded_trainerr   r   r   test_with_bert_finetune   s   

r   c                 C   sz   t | |ddd}tj|jd |jd }tj|dd dd	}d
|v s%J tdd |d  D r4J t	|j|d}d S )Nr   r   z
--use_peftr7   rL   c                 S   rl   rm   r   rn   r   r   r   rq      rr   z)test_with_peft_finetune.<locals>.<lambda>Trs   	bert_lorac                 s   ru   rv   rw   rx   r   r   r   r{     r|   z*test_with_peft_finetune.<locals>.<genexpr>rD   r   )
r^   r   r   rO   r8   rQ   r}   r~   r   r   )r   r3   rJ   r   r   r   r   r   r   test_with_peft_finetune   s   r   )-r%   loggingr   rb   pytestrQ   marktravispipeline
pytestmarkstanza.modelsr   stanza.models.ner.trainerr   stanza.testsr   *stanza.utils.datasets.ner.prepare_ner_filer   	getLoggerloggerlstripreplacer\   r]   stripEN_TRAIN_2TAGrX   r=   fixturer   r   r,   r9   r?   rK   rW   rY   r[   r^   ri   rk   r   r   r   r   r   r   r   r   <module>   sX    






