This is an attempt to build a sentiment classifier for movie reviews. We’ll create a binary linear classifier that reads reviews, like you’d see on Rotten Tomatoes, and predicts whether the sentiment is positive or negative.

The goal is to classify given movie reviews as “positive” or “negative” using a simple text classification system built from scratch.

Preparation

import random
from typing import Callable, Dict, List, Tuple, TypeVar, DefaultDict
from util import *

FeatureVector = Dict[str, int]
WeightVector = Dict[str, float]
Example = Tuple[FeatureVector, int]

Binary Classification

def extractWordFeatures(x: str) -> FeatureVector:
    """
    Extract word features for a string x. Words are delimited by
    whitespace characters only.
    @param string x:
    @return dict: feature vector representation of x.
    Example: "I am what I am" --> {'I': 2, 'am': 2, 'what': 1}
    """
    # BUG FIX: the previous implementation lowercased the text and stripped
    # punctuation, which contradicts the documented contract above — the
    # example preserves case ('I', not 'i') and promises whitespace-only
    # delimiting. Count each token exactly as it appears.
    word_features: FeatureVector = {}

    # str.split() with no argument splits on runs of any whitespace and
    # drops leading/trailing whitespace, so empty input yields {}.
    for word in x.split():
        word_features[word] = word_features.get(word, 0) + 1

    return word_features

Stochastic Gradient Descent

T = TypeVar("T")


def learnPredictor(
    trainExamples: List[Tuple[T, int]],
    validationExamples: List[Tuple[T, int]],
    featureExtractor: Callable[[T], FeatureVector],
    numEpochs: int,
    eta: float,
) -> WeightVector:
    """
    Learn a sparse weight vector with stochastic (sub)gradient descent on
    the hinge loss: for each example (x, y) with y in {-1, +1}, update the
    weights only when the margin y * (w . phi(x)) <= 1.

    @param trainExamples: list of (input, label) pairs used for training.
    @param validationExamples: held-out (input, label) pairs; only used to
        report an error rate each epoch, never for updates.
    @param featureExtractor: maps an input to a sparse FeatureVector.
    @param numEpochs: number of full passes over trainExamples.
    @param eta: step size for each subgradient update.
    @return dict: the learned feature => weight mapping.
    """

    def dotProduct(d1, d2):
        """
        @param dict d1: a feature vector represented by a mapping from a feature (string) to a weight (float).
        @param dict d2: same as d1
        @return float: the dot product between d1 and d2
        """
        # Iterate over the smaller dict so sparse vectors stay cheap.
        if len(d1) < len(d2):
            return dotProduct(d2, d1)
        return sum(d1.get(f, 0) * v for f, v in d2.items())

    # FIX: weights was initialized twice in the original (dead assignment).
    weights: WeightVector = {}  # feature => weight

    for epoch in range(numEpochs):
        for x, y in trainExamples:
            phi = featureExtractor(x)
            margin = y * dotProduct(weights, phi)

            # Hinge-loss subgradient: only examples inside the margin
            # (including correctly classified ones with margin <= 1)
            # contribute an update of +eta * y * phi.
            if margin <= 1:
                for feature, value in phi.items():
                    weights[feature] = weights.get(feature, 0) + eta * y * value

        # One shared predictor instead of two duplicated lambdas; it closes
        # over the current weights, so it reflects this epoch's state.
        predictor = lambda x: 1 if dotProduct(featureExtractor(x), weights) >= 0 else -1
        train_error = evaluatePredictor(trainExamples, predictor)
        validation_error = evaluatePredictor(validationExamples, predictor)

        print(
            f"Epoch {epoch + 1}: Training error {train_error}, Validation error {validation_error}"
        )

    return weights

Toy Dataset

Create a toy dataset with generateDataset, whose inner generateExample helper samples random sparse feature vectors and labels them with the sign of their dot product against the given weights; this serves as a sanity-check test case for learnPredictor.

def generateDataset(numExamples: int, weights: WeightVector) -> List[Example]:
    """
    Generate numExamples synthetic (phi, y) pairs consistent with `weights`:
    each label is the sign of the dot product weights . phi, and candidate
    examples whose score is exactly zero are rejected and re-sampled so
    every label is unambiguous.

    @param numExamples: number of examples to generate.
    @param weights: the "true" weight vector the labels are derived from.
        Must be non-empty, otherwise every score would be zero.
    @return list of (FeatureVector, label) pairs with label in {-1, +1}.
    @raises ValueError: if weights is empty (the original code would have
        looped forever in that case).
    """
    if not weights:
        raise ValueError("weights must be non-empty to generate labeled examples")

    def _dot(phi: FeatureVector) -> float:
        # Local sparse dot product; phi only contains keys drawn from weights.
        return sum(weights[f] * v for f, v in phi.items())

    def generateExample() -> Example:
        # Rejection-sample until the score is nonzero so the label is well
        # defined. (The original also carried dead `phi = None; y = None`
        # initializations and a tautological assert — both removed.)
        while True:
            phi: FeatureVector = {}
            for key in weights:
                # Each key appears in phi with probability 1/2 ...
                if random.random() < 0.5:
                    # ... with a random integer value in [-10, 10].
                    phi[key] = random.randint(-10, 10)

            score = _dot(phi)
            if score != 0:
                break

        # score != 0 here, so `> 0` is equivalent to the original `>= 0`.
        y = 1 if score > 0 else -1
        return (phi, y)

    return [generateExample() for _ in range(numExamples)]

Character N-gram Features

An alternative feature extractor: character n-grams capture subword patterns (misspellings, shared word stems) and still work on text where whitespace tokenization breaks down.

def extractCharacterFeatures(n: int) -> Callable[[str], FeatureVector]:
    """
    Build a feature extractor that counts character n-grams.

    The returned function strips all spaces from its input, then slides a
    window of width n over the remaining characters and counts how often
    each n-gram occurs.
    @param n: length of each character n-gram.
    @return: a function mapping a string to its n-gram count vector.
    """

    def extract(text):
        compact = text.replace(" ", "")  # n-grams never span a space
        counts = {}

        # Every window start position from 0 to len(compact) - n inclusive;
        # strings shorter than n produce an empty feature vector.
        for start in range(len(compact) - n + 1):
            gram = compact[start : start + n]
            counts[gram] = counts.get(gram, 0) + 1

        return counts

    return extract

Testing Different Values of N

Finally, a test function to evaluate different values of n for extractCharacterFeatures.

def testValuesOfN(n: int):
    """
    Train and evaluate a character n-gram classifier on the polarity data.

    Reads polarity.train / polarity.dev, trains with learnPredictor
    (20 epochs, eta=0.01), writes the learned weights and an error
    analysis to disk, and prints the final train/validation error rates.
    @param n: length of the character n-grams to use as features.
    """
    trainExamples = readExamples("polarity.train")
    validationExamples = readExamples("polarity.dev")
    featureExtractor = extractCharacterFeatures(n)
    weights = learnPredictor(
        trainExamples, validationExamples, featureExtractor, numEpochs=20, eta=0.01
    )
    outputWeights(weights, "weights")
    # Use the error-analysis file to debug misclassified examples.
    outputErrorAnalysis(validationExamples, featureExtractor, weights, "error-analysis")

    # FIX: one shared predictor instead of two duplicated lambdas.
    predictor = lambda x: 1 if dotProduct(featureExtractor(x), weights) >= 0 else -1
    trainError = evaluatePredictor(trainExamples, predictor)
    validationError = evaluatePredictor(validationExamples, predictor)
    # f-string for consistency with learnPredictor's per-epoch print;
    # the emitted text is identical to the original %-formatting.
    print(f"Official: train error = {trainError}, validation error = {validationError}")