Now, I'm not big on jargon. I believe in explaining things the way I understand them, and I've found that newbies tend to grasp a concept better and faster when a fellow newbie explains it the way he understands it, breaking it down to their level. If you prefer the big technical definitions, go and research them; they are all over the internet.
What is a Phrase Detector?
A phrase detector is an algorithm that can be implemented with several techniques, the most popular being collocation counting (the technique gensim uses). It identifies words that frequently occur together with a minimum predefined frequency or, in gensim's terms, a threshold score, where the higher the threshold value, the stricter the selection process.
The model is used to identify phrases (bigrams) present in your texts and is especially useful when building word vectors: by combining words that frequently occur together into a single token, it shrinks the vocabulary of your word vectors and substantially reduces the computational complexity.
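To make that concrete, here is a minimal sketch of the idea using gensim directly; the toy sentences and the min_count and threshold values are made up purely for illustration:
from gensim.models.phrases import Phrases

# Tiny made-up corpus: each sentence is a list of tokens
toy_sentences = [
    ["new", "york", "is", "a", "big", "city"],
    ["i", "love", "new", "york"],
    ["new", "york", "never", "sleeps"],
]

# With such a small corpus we lower min_count and threshold so the
# frequently co-occurring pair "new york" passes the selection
toy_model = Phrases(toy_sentences, min_count=1, threshold=1)

# Transforming a new sentence joins the detected phrase into one token,
# printing ['i', 'moved', 'to', 'new_york']
print(toy_model[["i", "moved", "to", "new", "york"]])
Now, on to the real thing; we start with the imports we will need.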
import os
import re
import string
from typing import List
from itertools import islice
from gensim import utils
from gensim.models.phrases import Phrases, ENGLISH_CONNECTOR_WORDS
from nltk.tokenize import NLTKWordTokenizer, PunktSentenceTokenizer
This model is trained on the whole Malazan Book of the Fallen series (not only is it an excellent resource for training your model, it is also an even more excellent read), but it can be any [or all] books you currently have on your computer. The books should be in [or converted to] the .txt format for easy loading, and they should all sit in a folder called Books, with no subfolders containing any further text files.
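As a quick sanity check (this snippet simply assumes the folder layout described above; adjust the path if yours differs), you can list what will be read:
import os

# Assumes all books sit as .txt files directly inside ./Books
for fname in sorted(os.listdir("Books")):
    print(fname)
With the data in place, the next step is the custom iterator below.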
class CustomPathLineSentences:
"""Custom implementaion of gensim.models.word2vec.PathLineSentences
It differs from gensim implementation in that it replaces the default
tokenizer with a more powerful tokenizer, while also adding more
functionalities to it.
Functionalities
1) Break the block of text into sentences using PunktSentenceTokenizer()
as each text is split on \n
2) For each sentence
a) Tokenize sentence using NLTKWordTokenizer()
i) Clean each tokens
b) Join words that constitute phrases into a single word if a
phrase detector model is passed as argument
c) yield up the preprocessed tokens for further processing
2)
Parameters
source: str
File path of the folder containing the text file
limit: int
The maximum number of characters to read in each text block
include_phrase: bool
If True group words that constitue phrase into a single word,
this should only be set to True if a phrase detector model has
been trained
phrase_model: phrase detector model
The model used in detecting phrases in text
if include_phrase is True and phrase_model is None, a ValueError
is raised,
"""
def __init__(self, source, limit=None,
include_phrase=False, phrase_model=None):
self.source = source
self.limit = limit
self.include_phrase = include_phrase
self.word_tokenizer = NLTKWordTokenizer()
self.sentence_tokenizer = PunktSentenceTokenizer()
if self.include_phrase and phrase_model is not None:
self.phrase_model = phrase_model
elif self.include_phrase and phrase_model is None:
raise ValueError("phrase model detector not provided")
if os.path.isfile(self.source):
print('This is a file, use a folder next time')
self.input_files = [self.source]
elif os.path.isdir(self.source):
self.source = os.path.join(self.source, '')
self.input_files = os.listdir(self.source)
self.input_files = [self.source + fname
for fname in self.input_files]
self.input_files.sort()
else:
            raise ValueError('input is neither a file nor a directory')
def __word_cleaner(self, word, cleaned_word_tokens, punctuation) -> List[str]:
"""For each word if any punctuation is still found in the
beginning and ending, further split them, ignore any
punctuation found in between the alphabet
"""
beginning_punc = None
ending_punc = None
if len(word) > 1:
if word[0] in punctuation:
beginning_punc = word[0]
word = word[1:]
if word[-1] in punctuation:
ending_punc = word[-1]
word = word[:-1]
if beginning_punc is not None:
cleaned_word_tokens.append(beginning_punc)
        # For some reason the Jupyter notebook keeps restarting
        # because of this recursive code
# if word[0] in punctuation or word[-1] in punctuation:
# cleaned_word_tokens = self.__word_cleaner(word, cleaned_word_tokens,
# punctuation)
# else:
# cleaned_word_tokens.append(word)
cleaned_word_tokens.append(word)
if ending_punc is not None:
cleaned_word_tokens.append(ending_punc)
return cleaned_word_tokens
def clean_token_words(self, sentence) -> List[str]:
"""Split a sentence into tokens for further preprocessing"""
word_tokens: list = sentence.split()
cleaned_word_tokens = []
punctuation = string.punctuation + "’" + "‘"
for word in word_tokens:
if not self.include_phrase:
cleaned_word_tokens.append(word.strip(punctuation))
else:
self.__word_cleaner(word, cleaned_word_tokens, punctuation)
return cleaned_word_tokens
def __iter__(self):
"""Iterate through the files"""
pattern = re.compile("[‘’]")
total_count = 0
for fname in self.input_files:
with utils.open(fname, 'rb') as fin:
# iterate through the text using the inbuilt
# readline function
for line in islice(fin, self.limit):
line = utils.to_unicode(line).strip()
if line:
# text broken at the line break point may contain
# many sentences in it, use a sentence segmenter
# to further break them into sentences
sentences = self.sentence_tokenizer.tokenize(line)
# for each of those sentences break them into tokens
for sentence in sentences:
sentence = pattern.sub("'", sentence)
word_tokens = self.clean_token_words(sentence)
if not self.include_phrase:
yield word_tokens
else:
                                # combine detected words that constitute
                                # phrases into a single word
generator = self.phrase_model.analyze_sentence(word_tokens)
yield [word[0] for word in generator]
def __len__(self):
counts = 0
for sentences in self.__iter__():
counts += 1
return counts
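Before we move on, here is a small sketch of how the class can be used on its own (the Books folder is the one described earlier; the slice is only there to keep the output short):
from itertools import islice

# Iterate over the corpus and print the first few preprocessed sentences
sentences = CustomPathLineSentences('Books')
for tokens in islice(sentences, 3):
    print(tokens)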
Ok, that was fun. You just created and added a handy tool to your tool belt; it will be useful in several scenarios [especially when building a word vector or a style inference model], so save it somewhere in a Python file named utils.py.
I cheated a little: I built CustomPathLineSentences to be generic, with many use cases not specific to this project, as you will see when I use it to build a word vector and to train a style inference model.
In summary, once instantiated this class becomes an iterator for us, walking through our text files and preprocessing them at the same time, leaving us the more difficult task of training a phrase detector model, which we are going to do below:
# train a phrase detector
def train_phrase_detector(*, threshold=400, reduce_model_memory_size=False):
    sentences_iterator = CustomPathLineSentences('Books')
    print("List of files that will be analyzed for word phrases (bigrams)")
    for file in sentences_iterator.input_files:
        print(file)

    phrases = Phrases(sentences_iterator, threshold=threshold,
                      connector_words=ENGLISH_CONNECTOR_WORDS)
    print("Training completed")
    return ((phrases.freeze(), sentences_iterator) if reduce_model_memory_size
            else (phrases, sentences_iterator))
We defined a function that will handle the task of training the model for us using the preset and default parameters. Next we will go on to train the model by executing the function.
threshold = 400
reduce_model_memory_size = False
phrase_model, sentences_iterator = train_phrase_detector(
threshold=threshold,
reduce_model_memory_size=reduce_model_memory_size)
# saving the trained model
fname = "malaz_phrase_detector"
phrase_model.save(fname)
Good, we have finished training the model and saving it to disk for later use when we build a word vector and a style inference model. Let's see what the model learned and test whether it can detect word phrases in text.
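If you are running this in a fresh session, you can reload the saved model (and recreate the iterator) first; a minimal sketch, using the file name from above:
from gensim.models.phrases import Phrases
from utils import CustomPathLineSentences  # assuming you saved the class in utils.py as suggested

# Only needed when resuming in a new session: reload the saved phrase
# detector and recreate the preprocessing iterator used earlier
phrase_model = Phrases.load("malaz_phrase_detector")
sentences_iterator = CustomPathLineSentences('Books')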
# print how many phrases the model detected in the training text
print(f"Total number of phrases (bigrams) detected: {len(phrase_model.export_phrases())}")
text = """The Foolish Dog Clan will join your companies on the other side,' Coltaine said. 'You and the Weasel Clan shall guard this side while the wounded and the refugees cross"""
# preprocess the text in the same way the training
# text was preprocessed
text_cleaned = sentences_iterator.clean_token_words(text)
# detect phrases (bigrams) in text
phrases_detected = phrase_model.analyze_sentence(text_cleaned)
print("Detected phrases")
for token, score in phrases_detected:
    if score is not None:
        print(token)
We have successfully built a phrase detector model and tested it on a piece of text, and it was able to detect the phrases in that text.
This is important: the model generalizes well enough to detect phrases in text that follows the pattern of the training text [for example, the next book in a series when the model has been trained on the previous books]. But if the model is asked to detect phrases in text with a completely different pattern, one that does not even share the training text's vocabulary, it will fail woefully.
God loves you!