Wednesday, February 22, 2023

Bayes Sandbox

 WinkNLP has a simple but powerful text classifier using Bayesian analysis. I have working code. 

const winkNLP = require('wink-nlp');
const model = require('wink-eng-lite-web-model');
const fs = require('fs');

// Extract noun phrases from input.txt using a winkNLP custom-entity pattern
// and write them, one per line, to noun_phrases.txt.
const nlp = winkNLP(model);

const text = fs.readFileSync('input.txt', 'utf-8');

// Optional determiner, optional adjective, then a noun or proper noun.
const patterns = [
  {
    name: 'nounPhrase',
    patterns: [ '[|DET] [|ADJ] [NOUN|PROPN]' ]
  },
];

nlp.learnCustomEntities(patterns);

const doc = nlp.readDoc(text);
const entities = doc.customEntities().out();

// Name the output file once so the success message always reports the real path
// (the message previously claimed 'output.txt').
const outputPath = 'noun_phrases.txt';

fs.writeFile(outputPath, entities.join('\n'), (err) => {
  if (err) throw err;
  console.log(`The output has been written to ${outputPath}`);
});


Okay, this is clunky, but functioning. You will need to run the extraction twice: once with the noun-phrase pattern above to create a text file called noun_phrases.txt, and once with a verb-phrase pattern to create a text file called verb_phrases.txt.

Now, we want to do the same thing, but create a file called verb_phrases.txt. To make this as simple as possible, I've provided a separate file. Clunky, I know. 

const winkNLP = require('wink-nlp');
const model = require('wink-eng-lite-web-model');
const fs = require('fs');

// Extract verb phrases from input.txt using a winkNLP custom-entity pattern
// and write them, one per line, to verb_phrases.txt.
const nlp = winkNLP(model);

const text = fs.readFileSync('input.txt', 'utf-8');

// Optional adverb, particle, adjective and noun, followed by a verb.
// The particle tag is the Universal POS tag 'PART'; 'PARTICLE' is not a POS
// tag, so that slot could never match an actual particle.
const patterns = [
  {
    name: 'verbPhrase',
    patterns: [ '[|ADV] [|PART] [|ADJ] [|NOUN] [VERB]' ]
  }
];

nlp.learnCustomEntities(patterns);

const doc = nlp.readDoc(text);
const entities = doc.customEntities().out();

// Name the output file once so the success message always reports the real path
// (the message previously claimed 'output.txt').
const outputPath = 'verb_phrases.txt';

fs.writeFile(outputPath, entities.join('\n'), (err) => {
  if (err) throw err;
  console.log(`The output has been written to ${outputPath}`);
});


Obviously, you can just make the edits yourself. I have to preprocess the data to get rid of the clunkiness. This is an epic amount of clunkiness.

// Load Naive Bayes Text Classifier
const Classifier = require('wink-naive-bayes-text-classifier');
// Instantiate the classifier that is trained and queried below
const nbc = Classifier();
// Load wink nlp and its model
const winkNLP = require('wink-nlp');
// Load language model
const model = require('wink-eng-lite-web-model');
const nlp = winkNLP(model);
// Shorthand for the token property accessors (type, stem, flags) used in pre-processing
const its = nlp.its;
const fs = require('fs');

/**
 * Read a phrase file and return its phrases, one per non-blank line.
 *
 * Handles both LF and CRLF line endings, trims surrounding whitespace, and
 * drops blank lines (including the empty entry produced by a trailing newline
 * or an empty file) so that no empty training samples reach the classifier.
 *
 * @param {string} filePath - Path of the UTF-8 text file to read.
 * @returns {string[]} The non-empty, trimmed lines of the file, in order.
 * @throws {Error} If the file cannot be read (propagated from fs.readFileSync).
 */
const readAndPreprocessFile = function (filePath) {
  const fileContents = fs.readFileSync(filePath, 'utf-8');
  return fileContents
    .split(/\r?\n/)
    .map((line) => line.trim())
    .filter((line) => line.length > 0);
};

// Pre-processing task: convert raw text into the list of features used by the classifier
const prepTask = function (text) {
  const features = [];
  nlp.readDoc(text).tokens().each((token) => {
    // Keep only tokens whose type is 'word' (ignores punctuation etc.)
    if (token.out(its.type) !== 'word') return;
    // Drop stop words
    if (token.out(its.stopWordFlag)) return;
    // Use the stem of the word; prefix it with '!' when the token is negated
    const stem = token.out(its.stem);
    features.push(token.out(its.negationFlag) ? '!' + stem : stem);
  });
  return features;
};

// Register the pre-processing task with the classifier
nbc.definePrepTasks([prepTask]);

// Configure behavior
nbc.defineConfig({ considerOnlyPresence: true, smoothingFactor: 0.5 });

// Training data: each phrase file paired with the label its lines are learned under
const trainingSets = [
  ['noun_phrases.txt', 'nounPhrase'],
  ['verb_phrases.txt', 'verbPhrase'],
];

// Read each file in turn and teach every line to the classifier under its label
for (const [phraseFile, label] of trainingSets) {
  for (const phrase of readAndPreprocessFile(phraseFile)) {
    nbc.learn(phrase, label);
  }
}

// Finalize the learnings before making predictions
nbc.consolidate();

// The classifier is now trained and can be used to make predictions.

const prediction = nbc.predict('failing stars');
console.log(prediction);

The expected output is verbPhrase.


No comments:

Post a Comment