Можно попробовать использовать ANTLR — пример описан в официальной документации ANTLR.
Либо пойти другим путём — выявлять косинусную близость, используя модель word2vec:
import gensim
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
from gensim.models.keyedvectors import KeyedVectors
from numpy import dot
from numpy.linalg import norm
def cosine_sim(a, b):
    """Return the cosine similarity of vectors *a* and *b*.

    Computed as the dot product divided by the product of the
    Euclidean norms; assumes neither vector is all-zero.
    """
    denominator = norm(a) * norm(b)
    return dot(a, b) / denominator
# load the w2v model
path_pretraind_model='./GoogleNews-vectors-negative300.bin/GoogleNews-vectors-negative300.bin' #set as the path of pretraind model
model = KeyedVectors.load_word2vec_format(path_pretraind_model, binary=True)
wlist = ['portugal', 'spain', 'belgium', 'country', 'netherlands', 'italy']
lenwlist=len(wlist)
avrsim=[]
#compute cosine similarity between each word in wlist with the other words in wlist
for i in range(lenwlist):
word=wlist[i]
totalsim=0
wordembed=model[word]
for j in range(lenwlist):
if i!=j:
word2embed=model[wlist[j]]
totalsim+=cosine_sim(wordembed, word2embed)
avrsim.append(totalsim/ (lenwlist-1)) #add the average similarity between word and any other words in wlist
index_min=avrsim.index(min(avrsim)) #get min similarity
print(wlist[index_min])
Алексей Гайдабура, студент Skillbox