PoS tagger model for a specific domain

I am trying to build a tagger model in spaCy v3.1 with .pos_ attributes for a specific domain. The code below manages to compile; however, it is not returning the .pos_ attributes. How could I extract them?

import plac
import random
from pathlib import Path
import spacy
from spacy.training import Example

# Maps each coarse custom tag to its Universal POS attribute.
TAG_MAP = {
    "N": {"pos": "NOUN"},
    "V": {"pos": "VERB"},
    "J": {"pos": "ADJ"},
}

 TRAIN_DATA = [
('Eu gosto ovos cozidos', {'tags': ['N', 'V', 'N', 'J']}),
('Comer presunto azul', {'tags': ['V', 'N', 'J']})
]
@plac.annotations(
lang=("ISO Code of language to use", "option", "1", str),
output_dir=("Optional output directory", "option", "o", Path),
n_iter=("Number of training iterations", "option", "n", int),)

def main(lang="pt", output_dir="./output_2", n_iter=25):
    """Create a blank model, train the tagger, and map tags to coarse POS.

    In spaCy v3 the tagger only predicts ``token.tag_``; ``token.pos_``
    stays empty unless an ``attribute_ruler`` (or morphologizer) maps the
    fine-grained tags to coarse POS values. The fix for the question is to
    add an attribute_ruler and load TAG_MAP into it.
    """
    nlp = spacy.blank(lang)
    tagger = nlp.add_pipe("tagger")

    # The tagger only needs the fine-grained label names (TAG_MAP keys);
    # passing the values dict to add_label raises an error in v3.
    for tag in TAG_MAP:
        tagger.add_label(tag)

    # FIX: map each predicted tag_ to its coarse pos_ after tagging.
    ruler = nlp.add_pipe("attribute_ruler")
    ruler.load_from_tag_map(TAG_MAP)

    # nlp.begin_training() is deprecated in v3 in favor of initialize().
    optimizer = nlp.initialize()
    for _ in range(n_iter):
        random.shuffle(TRAIN_DATA)
        losses = {}
        for text, annotations in TRAIN_DATA:
            example = Example.from_dict(nlp.make_doc(text), annotations)
            nlp.update([example], sgd=optimizer, losses=losses)
        print(losses)

    test_text = "Eu gosto ovos passados"

    # Save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        # exist_ok avoids the explicit exists() check; parents=True also
        # handles nested paths robustly.
        output_dir.mkdir(parents=True, exist_ok=True)
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # Reload the saved model and verify tag_ AND pos_ are both set.
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc = nlp2(test_text)
        print("Tags", [(t.text, t.tag_, t.pos_) for t in doc])

if __name__ == "__main__":
    # plac parses the CLI options declared in @plac.annotations and
    # forwards them to main().
    plac.call(main)

The last print returns:

Tags [('Eu', 'N', ''), ('gosto', 'V', ''), ('ovos', 'N', ''), ('passados', 'J', '')]


Solution 1:[1]

I am trying to do POS tagging with a classical sequence tagging model.

from typing import List, Dict, Union
import pandas as pd
import numpy as np
import sklearn
import nltk
import spacy
#! python3 -m spacy download en
nlp = spacy.load("en_core_web_sm")
# NOTE(review): the next two lines load the same pipeline again and
# rebind `nlp` — one of the two loads is redundant.
import en_core_web_sm
nlp = en_core_web_sm.load()

# Corpus visualization: NLTK Brown corpus
from nltk.corpus import brown
nltk.download('brown')  # tagged sentence corpus
nltk.download('universal_tagset')  # coarse universal tag set used below

After that, I queried the corpus for words and tagged sentences (sents):

# Bare variable annotation — documents brown's type only, executes nothing.
brown: nltk.corpus.util.LazyCorpusLoader
print(brown.words())
# words(): list of str
sents = brown.tagged_sents(tagset="universal")
# tagged_sents(): list of (list of (str,str))
# type(sents)
sents[-1]  # bare expression: only displays in a REPL/notebook, no effect in a script

I got this result

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]
[('From', 'ADP'),
 ('what', 'DET'),
 ('I', 'PRON'),
 ('was', 'VERB'),
 ('able', 'ADJ'),
 ('to', 'ADP'),
 ('gauge', 'NOUN'),
 ('in', 'ADP'),
 ('a', 'DET'),
 ('swift', 'ADJ'),
 (',', '.'),
 ('greedy', 'ADJ'),
 ('glance', 'NOUN'),
 (',', '.'),
 ('the', 'DET'),
 ('figure', 'NOUN'),
 ('inside', 'ADP'),
 ('the', 'DET'),
 ('coral-colored', 'ADJ'),
 ('boucle', 'NOUN'),
 ('dress', 'NOUN'),
 ('was', 'VERB'),
 ('stupefying', 'VERB'),
 ('.', '.')]

And I did Feature Extraction

def word2features(sent: List[tuple], i: int) -> Dict[str, Union[str, bool]]:
    """Extract CRF features for the token at position *i* of *sent*.

    *sent* is a tagged sentence — a list of ``(word, tag)`` tuples as
    produced by ``brown.tagged_sents()`` — not a spaCy ``Doc`` (the
    original annotation was wrong: the code indexes ``sent[i][0]``).

    Returns a dict of surface features for the word plus context features
    from its immediate neighbours, with BOS/EOS boundary markers.
    """
    word = sent[i][0]
    features: Dict[str, Union[str, bool]] = {
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit()
    }
    if i > 0:
        # Context features from the previous word.
        # (FIX: dropped the unused postag1 binding of the original.)
        word1 = sent[i-1][0]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper()
        })
    else:
        features['BOS'] = True  # beginning of sentence
    if i < len(sent)-1:
        # Context features from the next word.
        word1 = sent[i+1][0]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper()
        })
    else:
        features['EOS'] = True  # end of sentence
    # FIX: removed the debug print — a feature extractor called once per
    # token over the whole Brown corpus must not write to stdout.
    return features

And I tested the feature extraction to see if it works properly:

#sent  ==> list(str,str)
def sent2features(sent: List[tuple]) -> List[Dict[str, Union[str, bool]]]:
    """Return the feature dicts for every token of one tagged sentence.

    FIX: the original printed the list and fell through, returning None
    despite the declared return type; it must return the features so
    callers can build the CRF training matrix.
    """
    return [word2features(sent, i) for i in range(len(sent))]

# Extract features for every tagged sentence (the original built the same
# list as a bare expression and discarded it).
x = [sent2features(s) for s in sents]

And I got labels by

# Build the feature matrix x and the per-sentence label sequences.
# FIX: the original set `x = sents`, feeding raw (word, tag) tuples to the
# CRF instead of feature dicts — that is what made crf.fit() fail.
x: List[List[Dict[str, Union[str, bool]]]] = [sent2features(s) for s in sents]
labels: List[List[str]] = [[tag for _word, tag in sent] for sent in sents]
y = labels

And here I split the data into train and test sets, then fit the CRFsuite model:

  from sklearn.model_selection import train_test_split
    x_train: List[Dict[str, Union[str, bool]]] = []
    x_test: List[Dict[str, Union[str, bool]]] = []
    y_train: List[str] = []
    y_test: List[str] = []
    x_train,y_train,x_test,y_test = train_test_split(x,labels, test_size=0.1,shuffle=False)
    len(x_train) , len(y_train) , len(x_test) , len(y_test)

Now I am trying to train the POS tagging model:

#!pip install scikit-learn<0.24
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

# Linear-chain CRF trained with L-BFGS; c1/c2 are the L1/L2
# regularization coefficients (sklearn-crfsuite API).
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True)
# NOTE(review): the error reported below most likely originates here —
# x_train holds raw (word, tag) tuples (`x = sents`), not feature dicts.
crf.fit(x_train, y_train)

I am using gradient descent with the L-BFGS method as the algorithm. I got this error (see attached error).

And last part is the Evaluation

# Evaluate on the held-out sentences.
y_pred = crf.predict(x_test)

# FIX: the metrics' `labels` argument expects a flat list of label NAMES;
# the original passed `labels` (a list of per-sentence tag lists), and
# `sorted(labels, ...)` would sort lists rather than tag names.
label_names = sorted({tag for sent_tags in labels for tag in sent_tags})
metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=label_names)
# Sort so related labels (e.g. B-/I- prefixes) group together in the report.
sorted_labels = sorted(label_names, key=lambda name: (name[1:], name[0]))
print(metrics.flat_classification_report(y_test, y_pred, labels=sorted_labels, digits=3))

The expected feature extraction should look like the following example. The example sentence was: "This is a puppy." The resulting features are given below as JSON-formatted text:

[
  {
    "word.lower()": "this",
    "word[-3:]": "his",
    "word[-2:]": "is",
    "word.isupper()": false,
    "word.istitle()": true,
    "word.isdigit()": false,
    "BOS": true,
    "+1:word.lower()": "is",
    "+1:word.istitle()": false,
    "+1:word.isupper()": false
  },
  {
    "word.lower()": "is",
    "word[-3:]": "is",
    "word[-2:]": "is",
    "word.isupper()": false,
    "word.istitle()": false,
    "word.isdigit()": false,
    "-1:word.lower()": "this",
    "-1:word.istitle()": true,
    "-1:word.isupper()": false,
    "+1:word.lower()": "a",
    "+1:word.istitle()": false,
    "+1:word.isupper()": false
  },
  {
    "word.lower()": "a",
    "word[-3:]": "a",
    "word[-2:]": "a",
    "word.isupper()": false,
    "word.istitle()": false,
    "word.isdigit()": false,
    "-1:word.lower()": "is",
    "-1:word.istitle()": false,
    "-1:word.isupper()": false,
    "+1:word.lower()": "puppy",
    "+1:word.istitle()": false,
    "+1:word.isupper()": false
  },
  {
    "word.lower()": "puppy",
    "word[-3:]": "ppy",
    "word[-2:]": "py",
    "word.isupper()": false,
    "word.istitle()": false,
    "word.isdigit()": false,
    "-1:word.lower()": "a",
    "-1:word.istitle()": false,
    "-1:word.isupper()": false,
    "+1:word.lower()": ".",
    "+1:word.istitle()": false,
    "+1:word.isupper()": false
  },
  {
    "word.lower()": ".",
    "word[-3:]": ".",
    "word[-2:]": ".",
    "word.isupper()": false,
    "word.istitle()": false,
    "word.isdigit()": false,
    "-1:word.lower()": "puppy",
    "-1:word.istitle()": false,
    "-1:word.isupper()": false,
    "EOS": true
  }
]
[
  "DET",
  "AUX",
  "DET",
  "NOUN",
  "PUNCT"
]

By calling the following function:

import json
# sent = ["This", "is", "a", "puppy", "."]
sent = "This is a puppy."
# NOTE(review): preprocess_input is not defined anywhere in this article —
# presumably it tokenizes `sent`, extracts features, and predicts tags
# (returning features f and predictions p); confirm against the full code.
f, p = preprocess_input(sent)
print(json.dumps(f, indent=2))
print(json.dumps(p, indent=2))

Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source
Solution 1 Mohammed