o
    h                     @   s   d Z ddlZddlZddlZddlZddlmZ ddlmZ ddlmZ ddl	m
Z
 dd Zd	d
 Zdd Zedkr?e  dS dS )aa  
This script downloads and extracts the text from an Oscar crawl on HuggingFace

To use, just run

dump_oscar.py <lang>

It will download the dataset and output all of the text to the --output directory.
Files will be broken into pieces to avoid having one giant file.
By default, files will also be compressed with xz (although this can be turned off)
    N)tqdm)get_dataset_split_names)load_dataset)lang_to_langcodec                  C   sx   t  } | jddd | jdddd | jdd	d
ddd | jdddd | jdddgddd |  }t|j|_|S )z
    A few specific arguments for the dump program

    Uses lang_to_langcode to process args.language, hopefully converting
    a variety of possible formats to the short code used by HuggingFace
    languagezLanguage to download)helpz--output
oscar_dumpzPath for saving files)defaultr   z--no_xzxzTstore_falsez9Don't xz the files - default is to compress while writing)destr	   actionr   z--prefixz+Prefix to use for the pieces of the datasetz	--version20192023z.Which version of the Oscar dataset to download)choicesr	   r   )argparseArgumentParseradd_argument
parse_argsr   r   )parserargs r   Y/var/www/html/env_mimamsha/lib/python3.10/site-packages/stanza/utils/charlm/dump_oscar.pyr      s   r   c                 C   s   t dd}t| }d S )Noscar-corpus/OSCAR-2301sd)r   listkeys)r   datasetsplit_namesr   r   r   download_2023+   s   
r   c               
      s  t    j}  jdkrKd|  }ztd|}W n ty) } ztd|  |d }~ww t|dkr7td|td|}||d  }|jj	}dd	 }n3 jd
krwtd| }t
| }t|dkrhtd|||d  }|jj	}dd	 }ntd j td|d }tdtt|d } jrd j|f  fdd	}	nd j|f  fdd	}	td j  td| tj jdd d}
d}d}|	|
}t|D ]-}||}|| |d |t|7 }|d7 }|dkrd}|  |
d }
|	|
}q|  d S )Nr   zunshuffled_deduplicated_%soscarz.Language %s not available in HuggingFace Oscar   zUnexpected split_names: {}r   c                 S      | d S Ntextr   xr   r   r   <lambda>C       zmain.<locals>.<lambda>r   r   c                 S   r"   r#   r   r%   r   r   r   r'   R   r(   zUnknown version: %sg      ?g    חA   z%s_%%0%dd.txt.xzc                    s   t tj j|  dS )Nwt)lzmaopenospathjoinoutputfile_idxr   
format_strr   r   r'   [   s    z%s_%%0%dd.txtc                    s   t tj j|  dS )Nw)r,   r-   r.   r/   r0   r1   r3   r   r   r'   ^   s    zWriting dataset to %szDataset length: {}T)exist_ok
)r   r   versionr   
ValueErrorlenformatr   infosize_in_bytesr   r   AssertionErrormaxmathfloorlog10r
   prefixprintr0   r-   makedirsr   writeclose)r   dataset_namer   er   r=   process_itemchunksid_lenfopenr2   file_len	total_lenfoutitemr$   r   r3   r   main0   sh   








rR   __main__)__doc__r   r+   r@   r-   r   datasetsr   r   stanza.models.common.constantr   r   r   rR   __name__r   r   r   r   <module>   s    G
