"""
Basic testing of tokenization
"""

import pytest
import stanza

from stanza.tests import *

pytestmark = pytest.mark.pipeline

EN_DOC = "Joe Smith lives in California. Joe's favorite food is pizza. He enjoys going to the beach."
EN_DOC_WITH_EXTRA_WHITESPACE = "Joe   Smith \n lives in\n California.   Joe's    favorite food \tis pizza. \t\t\tHe enjoys \t\tgoing to the beach."
EN_DOC_GOLD_TOKENS = """
<Token id=1;words=[<Word id=1;text=Joe>]>
<Token id=2;words=[<Word id=2;text=Smith>]>
<Token id=3;words=[<Word id=3;text=lives>]>
<Token id=4;words=[<Word id=4;text=in>]>
<Token id=5;words=[<Word id=5;text=California>]>
<Token id=6;words=[<Word id=6;text=.>]>

<Token id=1-2;words=[<Word id=1;text=Joe>, <Word id=2;text='s>]>
<Token id=3;words=[<Word id=3;text=favorite>]>
<Token id=4;words=[<Word id=4;text=food>]>
<Token id=5;words=[<Word id=5;text=is>]>
<Token id=6;words=[<Word id=6;text=pizza>]>
<Token id=7;words=[<Word id=7;text=.>]>

<Token id=1;words=[<Word id=1;text=He>]>
<Token id=2;words=[<Word id=2;text=enjoys>]>
<Token id=3;words=[<Word id=3;text=going>]>
<Token id=4;words=[<Word id=4;text=to>]>
<Token id=5;words=[<Word id=5;text=the>]>
<Token id=6;words=[<Word id=6;text=beach>]>
<Token id=7;words=[<Word id=7;text=.>]>
""".strip()

# spaCy doesn't have MWT, so "Joe's" comes out as two separate tokens rather than one MWT
EN_DOC_SPACY_TOKENS = """
<Token id=1;words=[<Word id=1;text=Joe>]>
<Token id=2;words=[<Word id=2;text=Smith>]>
<Token id=3;words=[<Word id=3;text=lives>]>
<Token id=4;words=[<Word id=4;text=in>]>
<Token id=5;words=[<Word id=5;text=California>]>
<Token id=6;words=[<Word id=6;text=.>]>

<Token id=1;words=[<Word id=1;text=Joe>]>
<Token id=2;words=[<Word id=2;text='s>]>
<Token id=3;words=[<Word id=3;text=favorite>]>
<Token id=4;words=[<Word id=4;text=food>]>
<Token id=5;words=[<Word id=5;text=is>]>
<Token id=6;words=[<Word id=6;text=pizza>]>
<Token id=7;words=[<Word id=7;text=.>]>

<Token id=1;words=[<Word id=1;text=He>]>
<Token id=2;words=[<Word id=2;text=enjoys>]>
<Token id=3;words=[<Word id=3;text=going>]>
<Token id=4;words=[<Word id=4;text=to>]>
<Token id=5;words=[<Word id=5;text=the>]>
<Token id=6;words=[<Word id=6;text=beach>]>
<Token id=7;words=[<Word id=7;text=.>]>
""".strip()
EN_DOC_POSTPROCESSOR_TOKENS_LIST = [['Joe', 'Smith', 'lives', 'in', 'California', '.'], [("Joe's", True), 'favorite', 'food', 'is', 'pizza', '.'], ['He', 'enjoys', 'going', 'to', 'the', 'beach', '.']]
EN_DOC_POSTPROCESSOR_COMBINED_LIST = [['Joe', 'Smith', 'lives', 'in', 'California', '.'], ['Joe', "'s", 'favorite', 'food', 'is', 'pizza', '.'], ['He', 'enjoys', 'going', "to the beach", '.']]

EN_DOC_POSTPROCESSOR_COMBINED_TOKENS = """
<Token id=1;words=[<Word id=1;text=Joe>]>
<Token id=2;words=[<Word id=2;text=Smith>]>
<Token id=3;words=[<Word id=3;text=lives>]>
<Token id=4;words=[<Word id=4;text=in>]>
<Token id=5;words=[<Word id=5;text=California>]>
<Token id=6;words=[<Word id=6;text=.>]>

<Token id=1;words=[<Word id=1;text=Joe>]>
<Token id=2;words=[<Word id=2;text='s>]>
<Token id=3;words=[<Word id=3;text=favorite>]>
<Token id=4;words=[<Word id=4;text=food>]>
<Token id=5;words=[<Word id=5;text=is>]>
<Token id=6;words=[<Word id=6;text=pizza>]>
<Token id=7;words=[<Word id=7;text=.>]>

<Token id=1;words=[<Word id=1;text=He>]>
<Token id=2;words=[<Word id=2;text=enjoys>]>
<Token id=3;words=[<Word id=3;text=going>]>
<Token id=4;words=[<Word id=4;text=to the beach>]>
<Token id=5;words=[<Word id=5;text=.>]>
"""

# the entry above deliberately keeps a space-joined token ("to the beach") to test that tokens containing spaces survive

EN_DOC_GOLD_NOSSPLIT_TOKENS = """
<Token id=1;words=[<Word id=1;text=Joe>]>
<Token id=2;words=[<Word id=2;text=Smith>]>
<Token id=3;words=[<Word id=3;text=lives>]>
<Token id=4;words=[<Word id=4;text=in>]>
<Token id=5;words=[<Word id=5;text=California>]>
<Token id=6;words=[<Word id=6;text=.>]>
<Token id=7;words=[<Word id=7;text=Joe>]>
<Token id=8;words=[<Word id=8;text='s>]>
<Token id=9;words=[<Word id=9;text=favorite>]>
<Token id=10;words=[<Word id=10;text=food>]>
<Token id=11;words=[<Word id=11;text=is>]>
<Token id=12;words=[<Word id=12;text=pizza>]>
<Token id=13;words=[<Word id=13;text=.>]>
<Token id=14;words=[<Word id=14;text=He>]>
<Token id=15;words=[<Word id=15;text=enjoys>]>
<Token id=16;words=[<Word id=16;text=going>]>
<Token id=17;words=[<Word id=17;text=to>]>
<Token id=18;words=[<Word id=18;text=the>]>
<Token id=19;words=[<Word id=19;text=beach>]>
<Token id=20;words=[<Word id=20;text=.>]>
""".strip()

EN_DOC_PRETOKENIZED = \
    "Joe Smith lives in California .\nJoe's favorite  food is  pizza .\n\nHe enjoys going to the beach.\n"
EN_DOC_PRETOKENIZED_GOLD_TOKENS = """
<Token id=1;words=[<Word id=1;text=Joe>]>
<Token id=2;words=[<Word id=2;text=Smith>]>
<Token id=3;words=[<Word id=3;text=lives>]>
<Token id=4;words=[<Word id=4;text=in>]>
<Token id=5;words=[<Word id=5;text=California>]>
<Token id=6;words=[<Word id=6;text=.>]>

<Token id=1;words=[<Word id=1;text=Joe's>]>
<Token id=2;words=[<Word id=2;text=favorite>]>
<Token id=3;words=[<Word id=3;text=food>]>
<Token id=4;words=[<Word id=4;text=is>]>
<Token id=5;words=[<Word id=5;text=pizza>]>
<Token id=6;words=[<Word id=6;text=.>]>

<Token id=1;words=[<Word id=1;text=He>]>
<Token id=2;words=[<Word id=2;text=enjoys>]>
<Token id=3;words=[<Word id=3;text=going>]>
<Token id=4;words=[<Word id=4;text=to>]>
<Token id=5;words=[<Word id=5;text=the>]>
<Token id=6;words=[<Word id=6;text=beach.>]>
""".strip()

EN_DOC_PRETOKENIZED_LIST = [['Joe', 'Smith', 'lives', 'in', 'California', '.'], ['He', 'loves', 'pizza', '.']]
EN_DOC_PRETOKENIZED_LIST_GOLD_TOKENS = """
<Token id=1;words=[<Word id=1;text=Joe>]>
<Token id=2;words=[<Word id=2;text=Smith>]>
<Token id=3;words=[<Word id=3;text=lives>]>
<Token id=4;words=[<Word id=4;text=in>]>
<Token id=5;words=[<Word id=5;text=California>]>
<Token id=6;words=[<Word id=6;text=.>]>

<Token id=1;words=[<Word id=1;text=He>]>
<Token id=2;words=[<Word id=2;text=loves>]>
<Token id=3;words=[<Word id=3;text=pizza>]>
<Token id=4;words=[<Word id=4;text=.>]>
""".strip()

EN_DOC_NO_SSPLIT = ["This is a sentence. This is another.", "This is a third."]
EN_DOC_NO_SSPLIT_SENTENCES = [['This', 'is', 'a', 'sentence', '.', 'This', 'is', 'another', '.'], ['This', 'is', 'a', 'third', '.']]

FR_DOC = "Le prince va manger du poulet aux les magasins aujourd'hui."
FR_DOC_POSTPROCESSOR_TOKENS_LIST = [['Le', 'prince', 'va', 'manger', ('du', True), 'poulet', ('aux', True), 'les', 'magasins', "aujourd'hui", '.']]
FR_DOC_POSTPROCESSOR_COMBINED_MWT_LIST = [['Le', 'prince', 'va', 'manger', ('du', True), 'poulet', ('aux', True), 'les', 'magasins', ("aujourd'hui", ["aujourd'", "hui"]), '.']]
FR_DOC_POSTPROCESSOR_COMBINED_MWT_TOKENS = """
<Token id=1;words=[<Word id=1;text=Le>]>
<Token id=2;words=[<Word id=2;text=prince>]>
<Token id=3;words=[<Word id=3;text=va>]>
<Token id=4;words=[<Word id=4;text=manger>]>
<Token id=5-6;words=[<Word id=5;text=de>, <Word id=6;text=le>]>
<Token id=7;words=[<Word id=7;text=poulet>]>
<Token id=8-9;words=[<Word id=8;text=à>, <Word id=9;text=les>]>
<Token id=10;words=[<Word id=10;text=les>]>
<Token id=11;words=[<Word id=11;text=magasins>]>
<Token id=12-13;words=[<Word id=12;text=aujourd'>, <Word id=13;text=hui>]>
<Token id=14;words=[<Word id=14;text=.>]>
"""

JA_DOC = "北京は中国の首都です。 北京の人口は2152万人です。\n" # add some random whitespaces that need to be skipped
JA_DOC_GOLD_TOKENS = """
<Token id=1;words=[<Word id=1;text=北京>]>
<Token id=2;words=[<Word id=2;text=は>]>
<Token id=3;words=[<Word id=3;text=中国>]>
<Token id=4;words=[<Word id=4;text=の>]>
<Token id=5;words=[<Word id=5;text=首都>]>
<Token id=6;words=[<Word id=6;text=です>]>
<Token id=7;words=[<Word id=7;text=。>]>

<Token id=1;words=[<Word id=1;text=北京>]>
<Token id=2;words=[<Word id=2;text=の>]>
<Token id=3;words=[<Word id=3;text=人口>]>
<Token id=4;words=[<Word id=4;text=は>]>
<Token id=5;words=[<Word id=5;text=2152万>]>
<Token id=6;words=[<Word id=6;text=人>]>
<Token id=7;words=[<Word id=7;text=です>]>
<Token id=8;words=[<Word id=8;text=。>]>
""".strip()

JA_DOC_GOLD_NOSSPLIT_TOKENS = """
<Token id=1;words=[<Word id=1;text=北京>]>
<Token id=2;words=[<Word id=2;text=は>]>
<Token id=3;words=[<Word id=3;text=中国>]>
<Token id=4;words=[<Word id=4;text=の>]>
<Token id=5;words=[<Word id=5;text=首都>]>
<Token id=6;words=[<Word id=6;text=です>]>
<Token id=7;words=[<Word id=7;text=。>]>
<Token id=8;words=[<Word id=8;text=北京>]>
<Token id=9;words=[<Word id=9;text=の>]>
<Token id=10;words=[<Word id=10;text=人口>]>
<Token id=11;words=[<Word id=11;text=は>]>
<Token id=12;words=[<Word id=12;text=2152万>]>
<Token id=13;words=[<Word id=13;text=人>]>
<Token id=14;words=[<Word id=14;text=です>]>
<Token id=15;words=[<Word id=15;text=。>]>
""".strip()

ZH_DOC = "北京是中国的首都。 北京有2100万人口，是一个直辖市。\n"
ZH_DOC1 = "北\n京是中\n国的首\n都。 北京有2100万人口，是一个直辖市。\n"
ZH_DOC2 = "北\n京是中\n国的首\n都。\n\n 北京有2100万人口，是一个直辖市。\n"
ZH_DOC_GOLD_TOKENS = """
<Token id=1;words=[<Word id=1;text=北京>]>
<Token id=2;words=[<Word id=2;text=是>]>
<Token id=3;words=[<Word id=3;text=中国>]>
<Token id=4;words=[<Word id=4;text=的>]>
<Token id=5;words=[<Word id=5;text=首都>]>
<Token id=6;words=[<Word id=6;text=。>]>

<Token id=1;words=[<Word id=1;text=北京>]>
<Token id=2;words=[<Word id=2;text=有>]>
<Token id=3;words=[<Word id=3;text=2100>]>
<Token id=4;words=[<Word id=4;text=万>]>
<Token id=5;words=[<Word id=5;text=人口>]>
<Token id=6;words=[<Word id=6;text=，>]>
<Token id=7;words=[<Word id=7;text=是>]>
<Token id=8;words=[<Word id=8;text=一个>]>
<Token id=9;words=[<Word id=9;text=直辖市>]>
<Token id=10;words=[<Word id=10;text=。>]>
""".strip()

ZH_DOC1_GOLD_TOKENS="""
<Token id=1;words=[<Word id=1;text=北京>]>
<Token id=2;words=[<Word id=2;text=是>]>
<Token id=3;words=[<Word id=3;text=中国>]>
<Token id=4;words=[<Word id=4;text=的>]>
<Token id=5;words=[<Word id=5;text=首都>]>
<Token id=6;words=[<Word id=6;text=。>]>

<Token id=1;words=[<Word id=1;text=北京>]>
<Token id=2;words=[<Word id=2;text=有>]>
<Token id=3;words=[<Word id=3;text=2100万>]>
<Token id=4;words=[<Word id=4;text=人口>]>
<Token id=5;words=[<Word id=5;text=，>]>
<Token id=6;words=[<Word id=6;text=是>]>
<Token id=7;words=[<Word id=7;text=一>]>
<Token id=8;words=[<Word id=8;text=个>]>
<Token id=9;words=[<Word id=9;text=直辖>]>
<Token id=10;words=[<Word id=10;text=市>]>
<Token id=11;words=[<Word id=11;text=。>]>
""".strip()

ZH_DOC_GOLD_NOSSPLIT_TOKENS = """
<Token id=1;words=[<Word id=1;text=北京>]>
<Token id=2;words=[<Word id=2;text=是>]>
<Token id=3;words=[<Word id=3;text=中国>]>
<Token id=4;words=[<Word id=4;text=的>]>
<Token id=5;words=[<Word id=5;text=首都>]>
<Token id=6;words=[<Word id=6;text=。>]>
<Token id=7;words=[<Word id=7;text=北京>]>
<Token id=8;words=[<Word id=8;text=有>]>
<Token id=9;words=[<Word id=9;text=2100>]>
<Token id=10;words=[<Word id=10;text=万>]>
<Token id=11;words=[<Word id=11;text=人口>]>
<Token id=12;words=[<Word id=12;text=，>]>
<Token id=13;words=[<Word id=13;text=是>]>
<Token id=14;words=[<Word id=14;text=一个>]>
<Token id=15;words=[<Word id=15;text=直辖市>]>
<Token id=16;words=[<Word id=16;text=。>]>
""".strip()

ZH_PARENS_DOC = "我们一起学(猫叫)"

TH_DOC = "ข้าราชการได้รับการหมุนเวียนเป็นระยะ และเขาได้รับมอบหมายให้ประจำในระดับภูมิภาค"
TH_DOC_GOLD_TOKENS = """
<Token id=1;words=[<Word id=1;text=ข้าราชการ>]>
<Token id=2;words=[<Word id=2;text=ได้รับ>]>
<Token id=3;words=[<Word id=3;text=การ>]>
<Token id=4;words=[<Word id=4;text=หมุนเวียน>]>
<Token id=5;words=[<Word id=5;text=เป็นระยะ>]>

<Token id=1;words=[<Word id=1;text=และ>]>
<Token id=2;words=[<Word id=2;text=เขา>]>
<Token id=3;words=[<Word id=3;text=ได้>]>
<Token id=4;words=[<Word id=4;text=รับมอบหมาย>]>
<Token id=5;words=[<Word id=5;text=ให้>]>
<Token id=6;words=[<Word id=6;text=ประจำ>]>
<Token id=7;words=[<Word id=7;text=ใน>]>
<Token id=8;words=[<Word id=8;text=ระดับ>]>
<Token id=9;words=[<Word id=9;text=ภูมิภาค>]>
""".strip()

TH_DOC_GOLD_NOSSPLIT_TOKENS = """
<Token id=1;words=[<Word id=1;text=ข้าราชการ>]>
<Token id=2;words=[<Word id=2;text=ได้รับ>]>
<Token id=3;words=[<Word id=3;text=การ>]>
<Token id=4;words=[<Word id=4;text=หมุนเวียน>]>
<Token id=5;words=[<Word id=5;text=เป็นระยะ>]>
<Token id=6;words=[<Word id=6;text=และ>]>
<Token id=7;words=[<Word id=7;text=เขา>]>
<Token id=8;words=[<Word id=8;text=ได้>]>
<Token id=9;words=[<Word id=9;text=รับมอบหมาย>]>
<Token id=10;words=[<Word id=10;text=ให้>]>
<Token id=11;words=[<Word id=11;text=ประจำ>]>
<Token id=12;words=[<Word id=12;text=ใน>]>
<Token id=13;words=[<Word id=13;text=ระดับ>]>
<Token id=14;words=[<Word id=14;text=ภูมิภาค>]>
""".strip()

@pytest.fixture(scope="module")
def basic_pipeline():
    """ Create a pipeline with a basic English tokenizer """
    nlp = stanza.Pipeline(processors='tokenize', dir=TEST_MODELS_DIR, lang='en')
    return nlp


@pytest.fixture(scope="module")
def pretokenized_pipeline():
    """ Create a pipeline with a basic English pretokenized tokenizer """
    nlp = stanza.Pipeline(**{'processors': 'tokenize', 'dir': TEST_MODELS_DIR, 'lang': 'en', 'tokenize_pretokenized': True})
    return nlp

@pytest.fixture(scope="module")
def zh_pipeline():
    """ Create a pipeline with a basic Chinese tokenizer """
    nlp = stanza.Pipeline(lang='zh', processors='tokenize', dir=TEST_MODELS_DIR)
    return nlp

def test_tokenize(basic_pipeline):
    doc = basic_pipeline(EN_DOC)
    assert EN_DOC_GOLD_TOKENS == '\n\n'.join([sent.tokens_string() for sent in doc.sentences])
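    # each token's character offsets should map back to its exact surface text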
    assert all([doc.text[token._start_char: token._end_char] == token.text for sent in doc.sentences for token in sent.tokens])

def test_tokenize_ssplit_robustness(basic_pipeline):
    doc = basic_pipeline(EN_DOC_WITH_EXTRA_WHITESPACE)
    assert EN_DOC_GOLD_TOKENS == '\n\n'.join([sent.tokens_string() for sent in doc.sentences])
    assert all([doc.text[token._start_char: token._end_char] == token.text for sent in doc.sentences for token in sent.tokens])

def test_pretokenized(pretokenized_pipeline):
    doc = pretokenized_pipeline(EN_DOC_PRETOKENIZED)
    assert EN_DOC_PRETOKENIZED_GOLD_TOKENS == '\n\n'.join([sent.tokens_string() for sent in doc.sentences])
    assert all([doc.text[token._start_char: token._end_char] == token.text for sent in doc.sentences for token in sent.tokens])
    doc = pretokenized_pipeline(EN_DOC_PRETOKENIZED_LIST)
    assert EN_DOC_PRETOKENIZED_LIST_GOLD_TOKENS == '\n\n'.join([sent.tokens_string() for sent in doc.sentences])
    assert all([doc.text[token._start_char: token._end_char] == token.text for sent in doc.sentences for token in sent.tokens])

def test_pretokenized_multidoc(pretokenized_pipeline):
    doc = pretokenized_pipeline(EN_DOC_PRETOKENIZED)
    assert EN_DOC_PRETOKENIZED_GOLD_TOKENS == '\n\n'.join([sent.tokens_string() for sent in doc.sentences])
    assert all([doc.text[token._start_char: token._end_char] == token.text for sent in doc.sentences for token in sent.tokens])
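    # passing a list of Document objects exercises the multi-document path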
    doc = pretokenized_pipeline([stanza.Document([], text=EN_DOC_PRETOKENIZED_LIST)])[0]
    assert EN_DOC_PRETOKENIZED_LIST_GOLD_TOKENS == '\n\n'.join([sent.tokens_string() for sent in doc.sentences])
    assert all([doc.text[token._start_char: token._end_char] == token.text for sent in doc.sentences for token in sent.tokens])

def test_postprocessor():

    def dummy_postprocessor(in_doc):
        # Importantly, EN_DOC_POSTPROCESSOR_COMBINED_LIST returns a few tokens joined
        # with a space.  Since some languages (such as Vietnamese) have tokens with
        # internal spaces, it is important to test a space-joined token here
        assert in_doc == EN_DOC_POSTPROCESSOR_TOKENS_LIST
        return EN_DOC_POSTPROCESSOR_COMBINED_LIST

    nlp = stanza.Pipeline(**{'processors': 'tokenize', 'dir': TEST_MODELS_DIR,
                             'lang': 'en',
                             'tokenize_postprocessor': dummy_postprocessor})
    doc = nlp(EN_DOC)
    assert EN_DOC_POSTPROCESSOR_COMBINED_TOKENS.strip() == '\n\n'.join([sent.tokens_string() for sent in doc.sentences]).strip()

def test_postprocessor_mwt():

    def dummy_postprocessor(in_doc):
        # FR_DOC_POSTPROCESSOR_COMBINED_MWT_LIST keeps the (text, True) MWT flags and
        # adds an explicit expansion for "aujourd'hui", so this exercises the MWT
        # handling of the postprocessor interface
        assert in_doc == FR_DOC_POSTPROCESSOR_TOKENS_LIST
        return FR_DOC_POSTPROCESSOR_COMBINED_MWT_LIST

    nlp = stanza.Pipeline(**{'processors': 'tokenize', 'dir': TEST_MODELS_DIR,
                             'lang': 'fr',
                             'tokenize_postprocessor': dummy_postprocessor})
    doc = nlp(FR_DOC)
    assert FR_DOC_POSTPROCESSOR_COMBINED_MWT_TOKENS.strip() == '\n\n'.join([sent.tokens_string() for sent in doc.sentences]).strip()


def test_postprocessor_typeerror():
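    # a postprocessor which is not callable should be rejected when the pipeline is built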
    with pytest.raises(ValueError):
        nlp = stanza.Pipeline(**{'processors': 'tokenize', 'dir': TEST_MODELS_DIR, 'lang': 'en',
                                'tokenize_postprocessor': "iamachicken"})

def test_no_ssplit():
    nlp = stanza.Pipeline(**{'processors': 'tokenize', 'dir': TEST_MODELS_DIR, 'lang': 'en',
                                  'tokenize_no_ssplit': True})

    doc = nlp(EN_DOC_NO_SSPLIT)
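    # with tokenize_no_ssplit, each input chunk stays a single sentence even across periods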
    assert EN_DOC_NO_SSPLIT_SENTENCES == [[w.text for w in s.words] for s in doc.sentences]
    assert all([doc.text[token._start_char: token._end_char] == token.text for sent in doc.sentences for token in sent.tokens])

def test_zh_tokenizer_skip_newline(zh_pipeline):
    doc = zh_pipeline(ZH_DOC1)

    assert ZH_DOC1_GOLD_TOKENS == '\n\n'.join([sent.tokens_string() for sent in doc.sentences])
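    # the raw text span may still contain the skipped newlines, so drop them before comparing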
    assert all([doc.text[token._start_char: token._end_char].replace('\n', '') == token.text for sent in doc.sentences for token in sent.tokens])

def test_zh_tokenizer_skip_newline_offsets(zh_pipeline):
    doc = zh_pipeline(ZH_DOC2)

    assert ZH_DOC1_GOLD_TOKENS == '\n\n'.join([sent.tokens_string() for sent in doc.sentences])
    assert all([doc.text[token._start_char: token._end_char].replace('\n', '') == token.text for sent in doc.sentences for token in sent.tokens])

def test_zh_tokenizer_parens(zh_pipeline):
    """
    The original fix for newlines in Chinese text broke () in Chinese text
    """
    doc = zh_pipeline(ZH_PARENS_DOC)

    # ... the tokenization of this expression is still rather poor, so we don't check the exact tokens yet
    #assert ZH_PARENS_DOC_GOLD_TOKENS == '\n\n'.join([sent.tokens_string() for sent in doc.sentences])

def test_spacy():
    nlp = stanza.Pipeline(processors='tokenize', dir=TEST_MODELS_DIR, lang='en', tokenize_with_spacy=True, download_method=None)
    doc = nlp(EN_DOC)

    # make sure the loaded tokenizer is actually spacy
    assert "SpacyTokenizer" == nlp.processors['tokenize']._variant.__class__.__name__
    assert EN_DOC_SPACY_TOKENS == '\n\n'.join([sent.tokens_string() for sent in doc.sentences])
    assert all([doc.text[token._start_char: token._end_char] == token.text for sent in doc.sentences for token in sent.tokens])

def test_spacy_no_ssplit():
    nlp = stanza.Pipeline(processors='tokenize', dir=TEST_MODELS_DIR, lang='en', tokenize_with_spacy=True, tokenize_no_ssplit=True, download_method=None)
    doc = nlp(EN_DOC)

    # make sure the loaded tokenizer is actually spacy
    assert "SpacyTokenizer" == nlp.processors['tokenize']._variant.__class__.__name__
    assert EN_DOC_GOLD_NOSSPLIT_TOKENS == '\n\n'.join([sent.tokens_string() for sent in doc.sentences])
    assert all([doc.text[token._start_char: token._end_char] == token.text for sent in doc.sentences for token in sent.tokens])

def test_sudachipy():
    nlp = stanza.Pipeline(lang='ja', dir=TEST_MODELS_DIR, processors={'tokenize': 'sudachipy'}, package=None, download_method=None)
    doc = nlp(JA_DOC)

    assert "SudachiPyTokenizer" == nlp.processors['tokenize']._variant.__class__.__name__
    assert JA_DOC_GOLD_TOKENS == '\n\n'.join([sent.tokens_string() for sent in doc.sentences])
    assert all([doc.text[token._start_char: token._end_char] == token.text for sent in doc.sentences for token in sent.tokens])

def test_sudachipy_no_ssplit():
    nlp = stanza.Pipeline(lang='ja', dir=TEST_MODELS_DIR, processors={'tokenize': 'sudachipy'}, tokenize_no_ssplit=True, package=None, download_method=None)
    doc = nlp(JA_DOC)

    assert "SudachiPyTokenizer" == nlp.processors['tokenize']._variant.__class__.__name__
    assert JA_DOC_GOLD_NOSSPLIT_TOKENS == '\n\n'.join([sent.tokens_string() for sent in doc.sentences])
    assert all([doc.text[token._start_char: token._end_char] == token.text for sent in doc.sentences for token in sent.tokens])

def test_jieba():
    nlp = stanza.Pipeline(lang='zh', dir=TEST_MODELS_DIR, processors={'tokenize': 'jieba'}, package=None, download_method=None)
    doc = nlp(ZH_DOC)

    assert "JiebaTokenizer" == nlp.processors['tokenize']._variant.__class__.__name__
    assert ZH_DOC_GOLD_TOKENS == '\n\n'.join([sent.tokens_string() for sent in doc.sentences])
    assert all([doc.text[token._start_char: token._end_char] == token.text for sent in doc.sentences for token in sent.tokens])

def test_jieba_no_ssplit():
    nlp = stanza.Pipeline(lang='zh', dir=TEST_MODELS_DIR, processors={'tokenize': 'jieba'}, tokenize_no_ssplit=True, package=None, download_method=None)
    doc = nlp(ZH_DOC)

    assert "JiebaTokenizer" == nlp.processors['tokenize']._variant.__class__.__name__
    assert ZH_DOC_GOLD_NOSSPLIT_TOKENS == '\n\n'.join([sent.tokens_string() for sent in doc.sentences])
    assert all([doc.text[token._start_char: token._end_char] == token.text for sent in doc.sentences for token in sent.tokens])

def test_pythainlp():
    nlp = stanza.Pipeline(lang='th', dir=TEST_MODELS_DIR, processors={'tokenize': 'pythainlp'}, package=None, download_method=None)
    doc = nlp(TH_DOC)
    assert "PyThaiNLPTokenizer" == nlp.processors['tokenize']._variant.__class__.__name__
    assert TH_DOC_GOLD_TOKENS == '\n\n'.join([sent.tokens_string() for sent in doc.sentences])
    assert all([doc.text[token._start_char: token._end_char] == token.text for sent in doc.sentences for token in sent.tokens])

def test_pythainlp_no_ssplit():
    nlp = stanza.Pipeline(lang='th', dir=TEST_MODELS_DIR, processors={'tokenize': 'pythainlp'}, tokenize_no_ssplit=True, package=None, download_method=None)
    doc = nlp(TH_DOC)
    assert "PyThaiNLPTokenizer" == nlp.processors['tokenize']._variant.__class__.__name__
    assert TH_DOC_GOLD_NOSSPLIT_TOKENS == '\n\n'.join([sent.tokens_string() for sent in doc.sentences])
    assert all([doc.text[token._start_char: token._end_char] == token.text for sent in doc.sentences for token in sent.tokens])

