"""Basic tests for the part-of-speech tagging processor."""

import pytest
import stanza

from stanza.tests import *

pytestmark = pytest.mark.pipeline

EN_DOC = "Joe Smith was born in California."

# NOTE(review): this expected value contains only "]>" fragments, which looks
# like the remains of a longer per-token rendering whose markup (and line
# breaks) were lost.  It is reproduced here exactly as found -- confirm
# against the upstream test data before trusting test_part_of_speech.
EN_DOC_GOLD = """ ]> ]> ]> ]> ]> ]> ]> """.strip()


@pytest.fixture(scope="module")
def pos_pipeline():
    """Build one tokenize+pos English pipeline shared by every test in this module.

    Models are looked up under TEST_MODELS_DIR and download_method is None
    (presumably so that nothing is fetched during the test run -- confirm
    against the Pipeline documentation).
    """
    return stanza.Pipeline(
        processors='tokenize,pos',
        dir=TEST_MODELS_DIR,
        download_method=None,
        lang='en',
    )


def test_part_of_speech(pos_pipeline):
    """Tag EN_DOC and compare the rendered tokens of each sentence to the gold text."""
    tagged = pos_pipeline(EN_DOC)
    # Sentences are separated by a blank line in the rendered output.
    rendered = '\n\n'.join(sentence.tokens_string() for sentence in tagged.sentences)
    assert EN_DOC_GOLD == rendered


def test_get_known_xpos(pos_pipeline):
    """get_known_xpos() should report xpos tags and no upos tags."""
    pos_processor = pos_pipeline.processors['pos']
    known_xpos = pos_processor.get_known_xpos()
    # 'DT' is checked as an xpos tag, 'DET' as a upos tag that must be absent.
    assert 'DT' in known_xpos
    assert 'DET' not in known_xpos


def test_get_known_upos(pos_pipeline):
    """get_known_upos() should report upos tags and no xpos tags."""
    pos_processor = pos_pipeline.processors['pos']
    known_upos = pos_processor.get_known_upos()
    # Mirror image of the xpos test: 'DET' must be present, 'DT' must not.
    assert 'DET' in known_upos
    assert 'DT' not in known_upos


def test_get_known_feats(pos_pipeline):
    """get_known_feats() should map the 'Abbr' feature to values that include 'Yes'."""
    pos_processor = pos_pipeline.processors['pos']
    known_feats = pos_processor.get_known_feats()
    # I appreciate how self-referential the Abbr feat is
    assert 'Abbr' in known_feats
    assert 'Yes' in known_feats['Abbr']