This is an attempt at sentiment classification for movies. We will build a binary linear classifier that reads movie reviews, such as those found on sites like Rotten Tomatoes, and predicts whether each review is positive or negative.
The goal is to classify the given movie reviews as “positive” or “negative”. Here, we will create a simple text classification system that can perform this task automatically.
Here is some preparation:
import random
from typing import Callable, Dict, List, Tuple, TypeVar, DefaultDict
from util import *
# Sparse feature vector: maps a feature name to its integer value (e.g. a count).
FeatureVector = Dict[str, int]
# Sparse weight vector: maps a feature name to its real-valued weight.
WeightVector = Dict[str, float]
# One labeled example: (feature vector, integer label). The code in this file
# produces and predicts labels of +1 / -1.
Example = Tuple[FeatureVector, int]
Binary classification
def extractWordFeatures(x: str) -> FeatureVector:
    """
    Extract bag-of-words count features for a string x.

    The text is lowercased, every character in ``string.punctuation`` is
    removed, and the result is split on whitespace. Each remaining token is
    mapped to the number of times it occurs.

    @param string x: the raw text (for example one movie review).
    @return dict: feature vector mapping each normalized word to its count.
    Example: "I am what I am!" --> {'i': 2, 'am': 2, 'what': 1}
    """
    # Imported locally so the function does not rely on the file-level
    # import block.
    import string
    from collections import Counter

    # Normalize before tokenizing so that "Great", "great" and "great!" all
    # contribute to the same feature.
    normalized = x.lower().translate(str.maketrans("", "", string.punctuation))

    # Counter keeps first-occurrence order; dict(...) keeps the return type a
    # plain dict, as before.
    return dict(Counter(normalized.split()))
Stochastic Gradient Descent:
# Type of a raw (un-featurized) input, e.g. a review string.
T = TypeVar("T")


def learnPredictor(
    trainExamples: List[Tuple[T, int]],
    validationExamples: List[Tuple[T, int]],
    featureExtractor: Callable[[T], FeatureVector],
    numEpochs: int,
    eta: float,
) -> WeightVector:
    """
    Learn a linear classifier with stochastic gradient descent.

    For every training example (x, y), with phi = featureExtractor(x), the
    weights are updated by ``w += eta * y * phi`` whenever the margin
    ``y * (w . phi)`` is at most 1 (a hinge-loss subgradient step). After
    each epoch the training and validation errors are printed.

    @param list trainExamples: (input, label) pairs used for the updates;
        labels are expected to be +1 / -1.
    @param list validationExamples: (input, label) pairs used only for the
        per-epoch error report.
    @param featureExtractor: maps an input to a sparse feature vector.
    @param int numEpochs: number of full passes over trainExamples.
    @param float eta: step size of each update.
    @return dict: the learned weight vector (feature => weight).
    """

    def dotProduct(d1, d2):
        """
        @param dict d1: a feature vector represented by a mapping from a feature (string) to a weight (float).
        @param dict d2: same as d1
        @return float: the dot product between d1 and d2
        """
        # Iterate over the smaller mapping and look up in the larger one.
        small, large = (d1, d2) if len(d1) < len(d2) else (d2, d1)
        return sum(large.get(f, 0) * v for f, v in small.items())

    weights = {}  # feature => weight

    def predict(x):
        # Reads `weights` at call time, so it always reflects the latest update.
        return 1 if dotProduct(featureExtractor(x), weights) >= 0 else -1

    for epoch in range(numEpochs):
        for x, y in trainExamples:
            phi = featureExtractor(x)
            # Update only when the example is misclassified or inside the margin.
            if y * dotProduct(weights, phi) <= 1:
                for feature, value in phi.items():
                    weights[feature] = weights.get(feature, 0) + eta * y * value

        # evaluatePredictor is defined outside this block (expected to come
        # from util); its result is reported as the error for the epoch.
        train_error = evaluatePredictor(trainExamples, predict)
        validation_error = evaluatePredictor(validationExamples, predict)
        print(
            f"Epoch {epoch + 1}: Training error {train_error}, Validation error {validation_error}"
        )

    return weights
Create a toy dataset using the generateExample function and use it as a test case for learnPredictor.
def generateDataset(numExamples: int, weights: WeightVector) -> List[Example]:
    """
    Generate a random dataset that the given weight vector classifies perfectly.

    Each example picks every key of weights independently with probability
    0.5 and gives it a random integer value in [-10, 10]. Draws whose score
    (dot product with weights) is exactly 0 are rejected and redrawn. The
    label is the sign of the score.

    @param int numExamples: number of examples to generate.
    @param dict weights: the weight vector that defines the true labels.
    @return list: numExamples pairs (phi, y) with y in {+1, -1}.
    @raise ValueError: if examples are requested but weights has no non-zero
        entry, since every score would then be 0 and no draw could be accepted.
    """
    # Without this guard the rejection loop below would never terminate.
    if numExamples > 0 and not any(weights.values()):
        raise ValueError("weights must contain at least one non-zero entry")

    def generateExample() -> Tuple[Dict[str, int], int]:
        while True:
            phi = {}
            for key in weights:
                # Randomly select a subset of keys from weights
                if random.random() < 0.5:
                    # Assign random integer values to the selected keys
                    phi[key] = random.randint(-10, 10)
            # phi only contains keys of weights, so direct indexing is safe.
            score = sum(weights[key] * value for key, value in phi.items())
            if score != 0:
                # The label is the sign of the score, so it agrees with the
                # prediction of `weights` by construction.
                return phi, (1 if score > 0 else -1)

    return [generateExample() for _ in range(numExamples)]
Some additional character-level feature extraction to handle edge cases:
def extractCharacterFeatures(n: int) -> Callable[[str], FeatureVector]:
    """
    Build a feature extractor based on character n-grams.

    The returned function removes all space characters (' ') from its input
    and counts every run of n consecutive characters in what remains.

    @param int n: length of each n-gram; must be at least 1.
    @return function: maps a string to a dict of n-gram => count.
    @raise ValueError: if n is smaller than 1.
    Example (n = 3): "I like tacos" --> {'Ili': 1, 'lik': 1, 'ike': 1, ...}
    """
    # n-grams of length < 1 are meaningless; fail when the extractor is
    # built instead of silently producing empty-string features.
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n}")

    def extract(x):
        compact = x.replace(" ", "")  # Remove spaces from the input string
        features = {}
        # Slide a window of width n over the text; the range is empty when
        # the text is shorter than n, which yields no features.
        for start in range(len(compact) - n + 1):
            ngram = compact[start : start + n]
            features[ngram] = features.get(ngram, 0) + 1
        return features

    return extract
Finally, a test function to test different values of n for extractCharacterFeatures
def testValuesOfN(n: int, numEpochs: int = 20, eta: float = 0.01):
    """
    Train and evaluate a classifier that uses character n-gram features.

    Loads examples from "polarity.train" and "polarity.dev" with
    readExamples, trains with learnPredictor, hands the weights and an error
    analysis to outputWeights / outputErrorAnalysis (under the names
    "weights" and "error-analysis"), and prints the final training and
    validation errors.

    readExamples, outputWeights, outputErrorAnalysis, evaluatePredictor and
    dotProduct are not defined in this block; they are expected to come from
    util.

    @param int n: n-gram length passed to extractCharacterFeatures.
    @param int numEpochs: number of training epochs (default 20).
    @param float eta: SGD step size (default 0.01).
    """
    trainExamples = readExamples("polarity.train")
    validationExamples = readExamples("polarity.dev")
    featureExtractor = extractCharacterFeatures(n)
    weights = learnPredictor(
        trainExamples,
        validationExamples,
        featureExtractor,
        numEpochs=numEpochs,
        eta=eta,
    )
    outputWeights(weights, "weights")
    outputErrorAnalysis(
        validationExamples, featureExtractor, weights, "error-analysis"
    )  # Use this to debug

    def predict(x):
        # Sign of the linear score; a score of exactly 0 counts as positive.
        return 1 if dotProduct(featureExtractor(x), weights) >= 0 else -1

    trainError = evaluatePredictor(trainExamples, predict)
    validationError = evaluatePredictor(validationExamples, predict)
    print(
        "Official: train error = %s, validation error = %s"
        % (trainError, validationError)
    )