"""
A short script to use a Stanza tokenizer to extract tokenized sentences from Wikipedia

The first step is to convert a Wikipedia dataset using Prof. Attardi's wikiextractor:
https://github.com/attardi/wikiextractor

This script then writes out sentences, one per line, whitespace separated.
Some common issues with the tokenizer are accounted for by discarding the lines they affect.

Also, to account for languages such as VI where whitespace occurs within words,
spaces inside those words are replaced with _ (so a token such as "Việt Nam" becomes
"Việt_Nam").  This should not cause any confusion, as any line with a natural _ in it
has already been discarded.

for i in `echo A B C D E F G H I J K`; do nlprun "python3 stanza/utils/datasets/constituency/tokenize_wiki.py --output_file /u/nlp/data/constituency-parser/italian/2024_wiki_tokenization/it_wiki_tokenized_B$i.txt --lang it --max_len 120 --input_dir /u/nlp/data/Wikipedia/itwiki/B$i --tokenizer_model saved_models/tokenize/it_combined_tokenizer.pt --download_method None" -o /u/nlp/data/constituency-parser/italian/2024_wiki_tokenization/it_wiki_tokenized_B$i.out; done
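
A simpler invocation, with purely illustrative input and output paths, might look like:

python3 stanza/utils/datasets/constituency/tokenize_wiki.py --lang vi --input_dir extracted_wiki/AA --output_file vi_wiki_tokenized.txt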
    N)load_tokenizerfilter_data)selftrain_wiki)add_length_argstokenize_docs)get_tqdmc                  C   s   t jdd} | jdddd | jddd	d | jd
d}|jdd dd |jdd dd | jdd dd | jdd dd | jdd dd t|  |  }|S )NzFScript that converts part of a wikipedia dump to silver standard trees)descriptionz--output_filezvi_wiki_tokenized.txtz"Where to write the tokenized lines)defaulthelpz--langviz4Which language tools to use for tokenization and POST)requiredz--input_dirz<Path to the wikipedia dump after processing by wikiextractorz--input_filezMPath to a single file of the wikipedia dump after processing by wikiextractorz--bert_tokenizerz=Which bert tokenizer (if any) to use to filter long sentencesz--tokenizer_modelzHUse this model instead of the current Stanza tokenizer for this languagez--download_methodzTDownload pipeline models using this method (defaults to downloading updates from HF))argparseArgumentParseradd_argumentadd_mutually_exclusive_groupr   
parse_args)parserinput_groupargs r   k/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/utils/datasets/constituency/tokenize_wiki.pyr      sT   r   c                  C   sB  t  } | jd urt| j}n| jd ur| jg}ntd| jr,t| j}td|j	  i }| j
r6| j
|d< | jr>| j|d< tj| jfddi|}t| jddd	E}t|D ]7}t|}t||| j| j}| jrt| jd
d |D |tj}	dd |	D }|D ]}
||
 |d qqWW d    d S 1 sw   Y  d S )Nz/Need to specify at least one file or directory!zMax model length: %dtokenize_model_pathdownload_method
processorstokenizewzutf-8)encodingc                 S   s   g | ]}|  qS r   )split.0xr   r   r   
<listcomp>a   s    zmain.<locals>.<listcomp>c                 S   s   g | ]}d  |qS ) )joinr   r   r   r   r!   b   s    
)r   	input_dirr   list_wikipedia_files
input_file
ValueErrorbert_tokenizerr   printmodel_max_lengthtokenizer_modelr   stanzaPipelinelangopenoutput_filetqdmread_wiki_filer   min_lenmax_lenr   loggingDEBUGwrite)r   files	tokenizerpipeline_argspipefoutfilenamedocstextfilteredliner   r   r   mainI   s8   







"rC   __main__)__doc__r   r6   r-   #stanza.models.common.bert_embeddingr   r   "stanza.utils.datasets.constituencyr   ,stanza.utils.datasets.constituency.selftrainr   r   stanza.utils.get_tqdmr   r2   r   rC   __name__r   r   r   r   <module>   s    -
