from itertools import chain
import nltk
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelBinarizer
import pycrfsuite
The CoNLL 2002 corpus is available in NLTK. We use the Spanish data.
nltk.corpus.conll2002.fileids()
['esp.testa', 'esp.testb', 'esp.train', 'ned.testa', 'ned.testb', 'ned.train']
%%time
train_sents = list(nltk.corpus.conll2002.iob_sents('esp.train'))
test_sents = list(nltk.corpus.conll2002.iob_sents('esp.testb'))
CPU times: user 4.35 s, sys: 169 ms, total: 4.52 s Wall time: 4.52 s
Data format:
train_sents[0]
[('Melbourne', 'NP', 'B-LOC'), ('(', 'Fpa', 'O'), ('Australia', 'NP', 'B-LOC'), (')', 'Fpt', 'O'), (',', 'Fc', 'O'), ('25', 'Z', 'O'), ('may', 'NC', 'O'), ('(', 'Fpa', 'O'), ('EFE', 'NC', 'B-ORG'), (')', 'Fpt', 'O'), ('.', 'Fp', 'O')]
Next, define some features. In this example we use word identity, word suffix, word shape and word POS tag; also, some information from nearby words is used.
This makes a simple baseline, but you certainly can add and remove some features to get (much?) better results - experiment with it.
def word2features(sent, i):
    """Build the CRFsuite feature list for token *i* of *sent*.

    *sent* is a list of (token, postag, label) triples.  Each feature is a
    plain ``'name=value'`` string, the string-list input format pycrfsuite
    expects.  Features cover the current token's identity, suffixes, shape
    and POS tag, plus context from the neighbouring tokens (or BOS/EOS
    markers at the sentence boundaries).
    """
    token, pos = sent[i][0], sent[i][1]
    feats = [
        'bias',
        'word.lower=' + token.lower(),
        'word[-3:]=' + token[-3:],
        'word[-2:]=' + token[-2:],
        'word.isupper=%s' % token.isupper(),
        'word.istitle=%s' % token.istitle(),
        'word.isdigit=%s' % token.isdigit(),
        'postag=' + pos,
        'postag[:2]=' + pos[:2],
    ]
    if i == 0:
        # No left neighbour: mark beginning of sentence instead.
        feats.append('BOS')
    else:
        prev_tok, prev_pos = sent[i - 1][0], sent[i - 1][1]
        feats += [
            '-1:word.lower=' + prev_tok.lower(),
            '-1:word.istitle=%s' % prev_tok.istitle(),
            '-1:word.isupper=%s' % prev_tok.isupper(),
            '-1:postag=' + prev_pos,
            '-1:postag[:2]=' + prev_pos[:2],
        ]
    if i == len(sent) - 1:
        # No right neighbour: mark end of sentence instead.
        feats.append('EOS')
    else:
        next_tok, next_pos = sent[i + 1][0], sent[i + 1][1]
        feats += [
            '+1:word.lower=' + next_tok.lower(),
            '+1:word.istitle=%s' % next_tok.istitle(),
            '+1:word.isupper=%s' % next_tok.isupper(),
            '+1:postag=' + next_pos,
            '+1:postag[:2]=' + next_pos[:2],
        ]
    return feats
def sent2features(sent):
    """Return one feature list per token position of *sent*."""
    return [word2features(sent, idx) for idx, _ in enumerate(sent)]
def sent2labels(sent):
    """Return the IOB label of every (token, postag, label) triple in *sent*."""
    return [triple[2] for triple in sent]
def sent2tokens(sent):
    """Return the surface token of every (token, postag, label) triple in *sent*."""
    return [triple[0] for triple in sent]
This is what word2features extracts:
sent2features(train_sents[0])[0]
['bias', 'word.lower=melbourne', 'word[-3:]=rne', 'word[-2:]=ne', 'word.isupper=False', 'word.istitle=True', 'word.isdigit=False', 'postag=NP', 'postag[:2]=NP', 'BOS', '+1:word.lower=(', '+1:word.istitle=False', '+1:word.isupper=False', '+1:postag=Fpa', '+1:postag[:2]=Fp']
Extract the features from the data:
%%time
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]
X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]
CPU times: user 3.83 s, sys: 343 ms, total: 4.17 s Wall time: 4.17 s
To train the model, we create a pycrfsuite.Trainer, load the training data and call the 'train' method.
# use elastic net regularization; c2 is non-zero by default
trainer = pycrfsuite.Trainer('lbfgs', c1=1, max_iterations=100)
Load training data to CRFsuite:
%%time
for xseq, yseq in zip(X_train, y_train):
trainer.append_stringlists(xseq, yseq)
CPU times: user 4.62 s, sys: 76.5 ms, total: 4.69 s Wall time: 4.69 s
Train the model:
%%time
trainer.train('conll2002-esp.crfsuite')
CPU times: user 42.5 s, sys: 152 ms, total: 42.7 s Wall time: 42.7 s
trainer.train saves model to a file:
!ls -lh ./conll2002-esp.crfsuite
-rw-r--r-- 1 kmike staff 510K 30 апр 05:18 ./conll2002-esp.crfsuite
To use the trained model, create pycrfsuite.Tagger, open the model and use "tag" method:
tagger = pycrfsuite.Tagger()
tagger.open('conll2002-esp.crfsuite')
<contextlib.closing at 0x1067dda20>
A quick check - labels should be our IOB tags:
tagger.labels()
['B-LOC', 'O', 'B-ORG', 'B-PER', 'I-PER', 'B-MISC', 'I-ORG', 'I-LOC', 'I-MISC']
Let's tag a sentence to see how it works:
example_sent = test_sents[0]
print(' '.join(sent2tokens(example_sent)), end='\n\n')
print("Predicted:", ' '.join(tagger.tag(sent2features(example_sent))))
print("Correct: ", ' '.join(sent2labels(example_sent)))
La Coruña , 23 may ( EFECOM ) . Predicted: B-LOC I-LOC O O O O B-ORG O O Correct: B-LOC I-LOC O O O O B-ORG O O
def bio_classification_report(y_true, y_pred):
    """
    Classification report for a list of BIO-encoded sequences.
    It computes token-level metrics and discards "O" labels.
    """
    binarizer = LabelBinarizer()
    # Flatten the per-sentence label sequences into one long token stream,
    # fitting the binarizer on the gold labels and reusing it for predictions.
    flat_true = binarizer.fit_transform(list(chain.from_iterable(y_true)))
    flat_pred = binarizer.transform(list(chain.from_iterable(y_pred)))
    # Drop "O" and order the entity tags by type first, prefix second,
    # so B-X and I-X rows appear next to each other in the report.
    tags = sorted(set(binarizer.classes_) - {'O'},
                  key=lambda tag: tag.split('-', 1)[::-1])
    index_of = {cls: idx for idx, cls in enumerate(binarizer.classes_)}
    return classification_report(
        flat_true,
        flat_pred,
        labels=[index_of[tag] for tag in tags],
        target_names=tags,
    )
Predict entity labels for all sentences in our testing set ('testb' Spanish data):
%%time
y_pred = [tagger.tag(xseq) for xseq in X_test]
CPU times: user 650 ms, sys: 22.1 ms, total: 672 ms Wall time: 672 ms
...and check the result. Note this report is not comparable to results in CoNLL 2002 papers because here we check per-token results (not per-entity). Per-entity numbers will be worse.
print(bio_classification_report(y_test, y_pred))
precision recall f1-score support B-LOC 0.75 0.74 0.74 1084 I-LOC 0.86 0.93 0.90 634 B-MISC 0.67 0.41 0.51 339 I-MISC 0.86 0.93 0.90 634 B-ORG 0.79 0.87 0.83 735 I-ORG 0.86 0.93 0.90 634 B-PER 0.59 0.51 0.55 557 I-PER 0.86 0.93 0.90 634 avg / total 0.79 0.80 0.79 5251