The detected language is
Implementation
/**
 * Counts character n-grams of a fixed length over one or more texts.
 *
 * Counts accumulate in `freqMap` across successive `generate` calls.
 */
class NGram {
  /**
   * @param {number} n Length of each gram, in UTF-16 code units.
   */
  constructor(n) {
    this.n = n;
    this.freqMap = new Map();
  }

  /**
   * Lower-cases `text`, strips "\n" characters, and records every gram seen
   * by a window that starts as `n` spaces and then advances one code unit at
   * a time. The initial all-space window is recorded as well.
   *
   * @param {string} text Text to scan.
   */
  generate(text) {
    const normalized = text.toLowerCase().replace(/\n/g, "");
    let window = " ".repeat(this.n);
    this.addGram(window);
    // split("") yields UTF-16 code units, matching index-based access.
    for (const unit of normalized.split("")) {
      window = window.slice(1) + unit;
      this.addGram(window);
    }
  }

  /**
   * Increments the stored count for `gram`, starting from zero if unseen.
   *
   * @param {string} gram
   */
  addGram(gram) {
    const seenSoFar = this.freqMap.get(gram) ?? 0;
    this.freqMap.set(gram, seenSoFar + 1);
  }

  /**
   * @returns {Map<string, number>} The live gram -> count map (not a copy).
   */
  getFrequencies() {
    return this.freqMap;
  }

  /**
   * Builds a new Map holding the `k` highest-count grams, in descending
   * count order.
   *
   * @param {number} k Maximum number of entries to keep.
   * @returns {Map<string, number>}
   */
  getTopFrequencies(k) {
    const ranked = [...this.freqMap.entries()];
    ranked.sort(([, countA], [, countB]) => countB - countA);
    return new Map(ranked.slice(0, k));
  }
}
/**
 * Text classifier that scores a text against per-class n-gram profiles.
 *
 * Each class is represented by the top-`k` bigrams, trigrams and quadgrams of
 * its training text. Scores are sums of base-2 logarithms, so higher (closer
 * to zero) means a better match.
 */
class NaiveBayes {
  /**
   * @param {number} [k=300] How many of the most frequent n-grams to keep in
   *   each profile (per n-gram length).
   */
  constructor(k = 300) {
    this.classes = new Map();
    this.k = k;
  }

  /**
   * @param {string} text
   * @param {number} n N-gram length.
   * @returns {Map<string, number>} The `k` most frequent n-grams of `text`.
   */
  _getTopKNGrams(text, n) {
    const ngram = new NGram(n);
    ngram.generate(text);
    return ngram.getTopFrequencies(this.k);
  }

  /**
   * @param {string} text
   * @returns {{bigrams: Map<string, number>, trigrams: Map<string, number>,
   *   quadgrams: Map<string, number>}} Top-k profile of `text`.
   */
  _getProfile(text) {
    return {
      bigrams: this._getTopKNGrams(text, 2),
      trigrams: this._getTopKNGrams(text, 3),
      quadgrams: this._getTopKNGrams(text, 4),
    };
  }

  /**
   * Stores the profile of `text` under `label`, replacing any profile
   * previously stored for the same label.
   *
   * @param {string} text Training text.
   * @param {*} label Class identifier.
   */
  train(text, label) {
    this.classes.set(label, this._getProfile(text));
  }

  /**
   * Sums, over every distinct n-gram of the text profile,
   * `log2((classCount + 1) / (classNGrams.size + 2^n))`, where `classCount`
   * is the n-gram's count in the class profile (0 if absent). The text-side
   * counts are not used.
   *
   * @param {Map<string, number>} textNGrams
   * @param {Map<string, number>} classNGrams
   * @param {number} n N-gram length.
   * @returns {number} Log-score contribution for this n-gram length.
   */
  _calculateClassProbability(textNGrams, classNGrams, n) {
    // Loop-invariant, so compute it once.
    const denominator = classNGrams.size + 2 ** n;
    let logScore = 0;
    for (const ngram of textNGrams.keys()) {
      const classCount = classNGrams.get(ngram) ?? 0;
      logScore += Math.log2((classCount + 1) / denominator);
    }
    return logScore;
  }

  /**
   * Scores one class against a text profile. Shared by `predict` and
   * `predictDetailed` so both always report the same total.
   *
   * @returns {{probability: number, bigramProbability: number,
   *   trigramProbability: number, quadgramProbability: number}}
   */
  _scoreClass(textProfile, classProfile, className) {
    const bigramProbability = this._calculateClassProbability(
      textProfile.bigrams,
      classProfile.bigrams,
      2
    );
    const trigramProbability = this._calculateClassProbability(
      textProfile.trigrams,
      classProfile.trigrams,
      3
    );
    const quadgramProbability = this._calculateClassProbability(
      textProfile.quadgrams,
      classProfile.quadgrams,
      4
    );
    // Accumulate left to right (prior, then 2-, 3-, 4-grams) so the
    // floating-point total is fixed by a single summation order.
    const probability =
      Math.log2(this.getClassPrior(className)) +
      bigramProbability +
      trigramProbability +
      quadgramProbability;
    return {
      probability,
      bigramProbability,
      trigramProbability,
      quadgramProbability,
    };
  }

  /**
   * Finds the best-scoring class for `text`.
   *
   * @param {string} text
   * @returns {[*, number] | undefined} `[className, logScore]` of the best
   *   class (the earliest-trained one on ties), or `undefined` when no class
   *   has been trained.
   */
  predict(text) {
    let best;
    for (const [className, { probability }] of this.predictDetailed(text)) {
      // A NaN score can never be the maximum.
      if (Number.isNaN(probability)) continue;
      if (best === undefined || probability > best[1]) {
        best = [className, probability];
      }
    }
    return best;
  }

  /**
   * Scores `text` against every trained class.
   *
   * @param {string} text
   * @returns {Map<*, {probability: number, bigramProbability: number,
   *   trigramProbability: number, quadgramProbability: number}>} Per-class
   *   total log-score and its per-n-gram-length components, in training
   *   order.
   */
  predictDetailed(text) {
    const textProfile = this._getProfile(text);
    const classProbabilities = new Map();
    for (const [className, classProfile] of this.classes) {
      classProbabilities.set(
        className,
        this._scoreClass(textProfile, classProfile, className)
      );
    }
    return classProbabilities;
  }

  /**
   * @param {*} className
   * @returns {number} `1 / (number of trained classes)` for a trained class,
   *   otherwise 0 (including when nothing has been trained yet).
   */
  getClassPrior(className) {
    return this.classes.has(className) ? 1 / this.classes.size : 0;
  }
}
// Train one class per entry of `languages`, labelled by its `code`.
const classifier = new NaiveBayes();
languages.forEach(({ code, text }) => classifier.train(text, code));

/**
 * Returns the entry of `languages` whose `code` matches the classifier's
 * best-scoring class for `text`, or `undefined` if there is none.
 *
 * @param {string} text Text to classify.
 */
const detectLanguage = (text) => {
  // Classify once up front; doing it inside the find() callback would re-run
  // the full prediction for every language examined.
  const predictedCode = classifier.predict(text)?.[0];
  return languages.find((l) => l.code === predictedCode);
};