o
    h@                     @   sF   d Z ddlZddlZddlZddlmZ dd Zedkr!e  dS dS )a  
A utility script to load a word embedding file from a text file and save it as a .pt

Run it as follows:
  python stanza/models/common/convert_pretrain.py <.pt file> <text file> <# vectors>

Note that -1 for # of vectors will keep all the vectors.
You probably want to keep fewer than that for most publicly released
embeddings, though, as they can get quite large.

As a concrete example, you can convert a newly downloaded Faroese WV file as follows:
  python3 stanza/models/common/convert_pretrain.py ~/stanza/saved_models/pos/fo_farpahc.pretrain.pt ~/extern_data/wordvec/fasttext/faroese.txt -1
or save part of an Icelandic WV file:
  python3 stanza/models/common/convert_pretrain.py ~/stanza/saved_models/pos/is_icepahc.pretrain.pt ~/extern_data/wordvec/fasttext/icelandic.cc.is.300.vec 150000
Note that if the pretrain already exists, nothing will be changed.  It will not overwrite an existing .pt file.

    N)pretrainc                  C   s   t  } | jdd dd | jdd dd | jdtddd	d
 |  }tj|jr0t	d|j  |j
drBtj|j|j|j
d}ntj|j|j
|jd}t	dt|j d S )N	output_ptz$Where to write the converted PT file)defaulthelp	input_veczUnconverted vectors file	max_vocab?z7How many vectors to convert.  -1 means convert them all)typer   nargsr   z,Not overwriting existing pretrain file in %sz.csv)r   csv_filename)r   zPretrain is of size {})argparseArgumentParseradd_argumentint
parse_argsospathexistsr   printr   endswithr   Pretrainr   formatlenvocab)parserargspt r   `/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/models/common/convert_pretrain.pymain   s   r    __main__)__doc__r   r   sysstanza.models.commonr   r    __name__r   r   r   r   <module>   s    
