'PoS tagger model for a specific domain
I am trying to build a tagger model in spaCy v3.1 with .pos_ attributes for a specific domain. The code below compiles and trains, but it does not return the .pos_ attributes. How could I extract them?
import plac
import random
from pathlib import Path
import spacy
from spacy.training import Example
# Mapping from the custom fine-grained tags used in TRAIN_DATA to
# Universal POS values (the .pos_ attribute the question asks about).
TAG_MAP = {
'N': {'pos': 'NOUN'},
'V': {'pos': 'VERB'},
'J': {'pos': 'ADJ'}
}
# Toy Portuguese training corpus: (text, annotations) pairs in the
# dict format that Example.from_dict expects ('tags' = fine-grained).
TRAIN_DATA = [
('Eu gosto ovos cozidos', {'tags': ['N', 'V', 'N', 'J']}),
('Comer presunto azul', {'tags': ['V', 'N', 'J']})
]
@plac.annotations(
    lang=("ISO Code of language to use", "option", "1", str),
    output_dir=("Optional output directory", "option", "o", Path),
    n_iter=("Number of training iterations", "option", "n", int),)
def main(lang="pt", output_dir="./output_2", n_iter=25):
    """Create a blank model, train the tagger, and map tags to POS.

    In spaCy v3 the tagger component only sets token.tag_ (the
    fine-grained tag).  token.pos_ is filled by a separate component;
    the original code never added one, which is why .pos_ came back
    empty.  The fix is an attribute_ruler that maps each trained TAG
    to its Universal POS from TAG_MAP.
    """
    nlp = spacy.blank(lang)
    tagger = nlp.add_pipe("tagger")
    for tag in TAG_MAP:
        # v3 add_label takes only the label string (the old
        # add_label(tag, values) signature was removed, hence the error).
        tagger.add_label(tag)
    # nlp.initialize() is the v3 name; begin_training() is a deprecated alias.
    optimizer = nlp.initialize()
    for i in range(n_iter):
        random.shuffle(TRAIN_DATA)
        losses = {}
        for text, annotations in TRAIN_DATA:
            example = Example.from_dict(nlp.make_doc(text), annotations)
            nlp.update([example], sgd=optimizer, losses=losses)
        print(losses)

    # Map the fine-grained tags to coarse POS so .pos_ gets populated.
    ruler = nlp.add_pipe("attribute_ruler")
    for tag, values in TAG_MAP.items():
        ruler.add(patterns=[[{"TAG": tag}]], attrs={"POS": values["pos"]})

    test_text = "Eu gosto ovos passados"
    # Save model to output directory, then reload it and test.
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc = nlp2(test_text)
        print("Tags", [(t.text, t.tag_, t.pos_) for t in doc])


if __name__ == "__main__":
    plac.call(main)
The last print returns:
Tags [('Eu', 'N', ''), ('gosto', 'V', ''), ('ovos', 'N', ''), ('passados', 'J', '')]
Solution 1:[1]
I am trying to do POS tagging with a classical sequence-tagging model.
from typing import List, Dict, Union
import pandas as pd
import numpy as np
import sklearn
import nltk
import spacy
#! python3 -m spacy download en
nlp = spacy.load("en_core_web_sm")
import en_core_web_sm
nlp = en_core_web_sm.load()
# Corpus visualization: NLTK Brown corpus
from nltk.corpus import brown
nltk.download('brown')
nltk.download('universal_tagset')
After that, I queried the corpus for words and tagged sentences (sents):
# Bare variable annotation: documents brown's type, assigns nothing.
brown: nltk.corpus.util.LazyCorpusLoader
print(brown.words())
# words(): list of str
# "universal" collapses Brown's fine-grained tags into the 12-tag
# Universal tagset (NOUN, VERB, ADJ, ...).
sents = brown.tagged_sents(tagset="universal")
# tagged_sents(): list of (list of (str,str))
# type(sents)
sents[-1]  # bare expression — only displays its value in a notebook/REPL
I got this result
['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]
[('From', 'ADP'),
('what', 'DET'),
('I', 'PRON'),
('was', 'VERB'),
('able', 'ADJ'),
('to', 'ADP'),
('gauge', 'NOUN'),
('in', 'ADP'),
('a', 'DET'),
('swift', 'ADJ'),
(',', '.'),
('greedy', 'ADJ'),
('glance', 'NOUN'),
(',', '.'),
('the', 'DET'),
('figure', 'NOUN'),
('inside', 'ADP'),
('the', 'DET'),
('coral-colored', 'ADJ'),
('boucle', 'NOUN'),
('dress', 'NOUN'),
('was', 'VERB'),
('stupefying', 'VERB'),
('.', '.')]
And I did Feature Extraction
def word2features(sent: List[Tuple[str, str]], i: int) -> Dict[str, Union[str, bool]]:
    """Return the CRF feature dict for token *i* of a tagged sentence.

    `sent` is a list of (word, tag) pairs — the NLTK tagged_sents
    format — not a spaCy Doc as the original annotation claimed.
    Features cover the token itself plus a one-token window on each
    side, with BOS/EOS markers at the sentence boundaries.
    """
    word = sent[i][0]
    features: Dict[str, Union[str, bool]] = {
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
    }
    if i > 0:
        # previous-token context
        prev_word = sent[i - 1][0]
        features.update({
            '-1:word.lower()': prev_word.lower(),
            '-1:word.istitle()': prev_word.istitle(),
            '-1:word.isupper()': prev_word.isupper(),
        })
    else:
        features['BOS'] = True  # beginning of sentence
    if i < len(sent) - 1:
        # next-token context
        next_word = sent[i + 1][0]
        features.update({
            '+1:word.lower()': next_word.lower(),
            '+1:word.istitle()': next_word.istitle(),
            '+1:word.isupper()': next_word.isupper(),
        })
    else:
        features['EOS'] = True  # end of sentence
    return features
And I test feature extractions to see if working properly
#sent ==> list(str,str)
def sent2features(sent: List[Tuple[str, str]]) -> List[Dict[str, Union[str, bool]]]:
    """Return the feature dicts for every token in a tagged sentence.

    The original version printed the list and implicitly returned
    None, so the feature matrix could never actually reach the CRF —
    it must return the list.
    """
    return [word2features(sent, i) for i in range(len(sent))]


# Smoke-check the extractor over the whole corpus (result discarded).
_ = [sent2features(sents[i]) for i in range(len(sents))]
And I got labels by
# Features (x) and per-token tag sequences (labels / y) for the CRF.
# NOTE: the original set x = sents, i.e. the raw (word, tag) tuples;
# the CRF needs the extracted feature dicts, not the raw corpus.
x: List[List[Dict[str, Union[str, bool]]]] = [
    [word2features(s, i) for i in range(len(s))] for s in sents
]
labels: List[List[str]] = [[tag for _word, tag in s] for s in sents]
y = labels
And here I split the data into train and test sets, then fit the CRF model (sklearn-crfsuite):
from sklearn.model_selection import train_test_split

# train_test_split returns (x_train, x_test, y_train, y_test).
# The original unpacked it as x_train, y_train, x_test, y_test,
# silently swapping the test features with the training labels.
x_train, x_test, y_train, y_test = train_test_split(
    x, labels, test_size=0.1, shuffle=False)
print(len(x_train), len(y_train), len(x_test), len(y_test))
Now I try to train the POS-tagging model.
# NOTE(review): sklearn_crfsuite 0.3.6 is incompatible with
# scikit-learn >= 0.24 — CRF.fit can raise
# AttributeError: 'CRF' object has no attribute 'keep_tempfiles'
# even though training itself completed.  Either pin the dependency
# (pip install "scikit-learn<0.24") or swallow that specific error.
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',              # gradient descent via L-BFGS
    c1=0.1,                         # L1 regularisation strength
    c2=0.1,                         # L2 regularisation strength
    max_iterations=100,
    all_possible_transitions=True)
try:
    crf.fit(x_train, y_train)
except AttributeError:
    # Known sklearn>=0.24 incompatibility; the model is already fitted.
    pass
I am using gradient descent with the L-BFGS method as the algorithm, and I got this error:

And last part is the Evaluation
# Evaluation.  The `labels=` argument must be the list of tag NAMES the
# model predicts — the original passed `labels` (the per-sentence tag
# sequences used for training), which is the wrong shape entirely.
# crf.classes_ gives exactly the label names seen during fitting.
y_pred = crf.predict(x_test)
label_names = list(crf.classes_)
print(metrics.flat_f1_score(y_test, y_pred,
                            average='weighted', labels=label_names))
# Plain alphabetical sort; the original's B-/I- grouping key is an NER
# tutorial leftover and is meaningless for Universal POS tags.
sorted_labels = sorted(label_names)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3))
The expected feature extraction should look like the following example. The example sentence was: "This is a puppy." The resulting features are given below as JSON-formatted text:
[
{
"word.lower()": "this",
"word[-3:]": "his",
"word[-2:]": "is",
"word.isupper()": false,
"word.istitle()": true,
"word.isdigit()": false,
"BOS": true,
"+1:word.lower()": "is",
"+1:word.istitle()": false,
"+1:word.isupper()": false
},
{
"word.lower()": "is",
"word[-3:]": "is",
"word[-2:]": "is",
"word.isupper()": false,
"word.istitle()": false,
"word.isdigit()": false,
"-1:word.lower()": "this",
"-1:word.istitle()": true,
"-1:word.isupper()": false,
"+1:word.lower()": "a",
"+1:word.istitle()": false,
"+1:word.isupper()": false
},
{
"word.lower()": "a",
"word[-3:]": "a",
"word[-2:]": "a",
"word.isupper()": false,
"word.istitle()": false,
"word.isdigit()": false,
"-1:word.lower()": "is",
"-1:word.istitle()": false,
"-1:word.isupper()": false,
"+1:word.lower()": "puppy",
"+1:word.istitle()": false,
"+1:word.isupper()": false
},
{
"word.lower()": "puppy",
"word[-3:]": "ppy",
"word[-2:]": "py",
"word.isupper()": false,
"word.istitle()": false,
"word.isdigit()": false,
"-1:word.lower()": "a",
"-1:word.istitle()": false,
"-1:word.isupper()": false,
"+1:word.lower()": ".",
"+1:word.istitle()": false,
"+1:word.isupper()": false
},
{
"word.lower()": ".",
"word[-3:]": ".",
"word[-2:]": ".",
"word.isupper()": false,
"word.istitle()": false,
"word.isdigit()": false,
"-1:word.lower()": "puppy",
"-1:word.istitle()": false,
"-1:word.isupper()": false,
"EOS": true
}
]
[
"DET",
"AUX",
"DET",
"NOUN",
"PUNCT"
]
By calling the following function:
import json
# sent = ["This", "is", "a", "puppy", "."]
sent = "This is a puppy."
# NOTE(review): preprocess_input is not defined anywhere in this file —
# presumably it tokenizes `sent`, runs the feature extractor, and
# predicts tags with the trained CRF, returning (features, predictions).
# It must be implemented before this snippet will run; confirm intent.
f, p = preprocess_input(sent)
print(json.dumps(f, indent=2))
print(json.dumps(p, indent=2))
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
| Solution | Source |
|---|---|
| Solution 1 | Mohammed |
