
Building a Small Language Model


This demo is heavily influenced by Karpathy's tutorial


Load Dataset

The names come from the list of legislative candidates for West Java (Jawa Barat) published on KPU Open Data:

https://opendata.kpu.go.id/dataset/0bfd84b3f-fa1e47b36-dd6d5a452-73feb

import pandas as pd

df = pd.read_csv('jabar.csv', sep=',', header=None, names=['dapil', 'provinsi', 'nama_dapil', 'nomor_partai', 'nama_partai', 'nomor_urut', 'nama_caleg', 'jenis_kelamin'])
names = df['nama_caleg']
names

Data Cleansing

names.describe()
# remove the header
names = names[1:]
names
# keep only the part before the first comma (drops trailing titles)
names = names.str.split(',').str[0]
names
# keep only the part after the last dot (drops leading titles)
names = names.str.split('.').str[-1]
names

# remove trailing and leading whitespace
names = names.str.strip()
names
# upper case
names = names.str.upper()
names
# get all unique chars in the names
unique_chars = sorted(list(set(''.join(names))))
unique_chars
# remove (, ), -, 0, \x81, ™
names = names.str.replace(r'[\(\)\-0\x81™]', '', regex=True)

# get all unique chars in the names
unique_chars = sorted(list(set(''.join(names))))
unique_chars

Convert to Numbers

# define the vocabs
vocabs = sorted(list(set(''.join(names))))
vocabs
# add the <.> start/end token to the vocabs
vocabs = ['<.>'] + vocabs
vocabs
# map to numbers and vice versa
char_to_num = {char: i for i, char in enumerate(vocabs)}
num_to_char = {i: char for i, char in enumerate(vocabs)}

char_to_num

Generate Pairs

# generate pair matrix
pairs = [[0 for _ in range(len(vocabs))] for _ in range(len(vocabs))]

# draw pair matrix
def draw_pair_matrix(pairs):
    # the first line is the label
    print('\t' + '\t'.join([num_to_char[i] for i in range(len(pairs))]))

    for i in range(len(pairs)):
        # the first column is the label
        print(num_to_char[i], end='\t')
        print('\t'.join([str(p) for p in pairs[i]]))

draw_pair_matrix(pairs)
# Count the pairs
for name in names:
    chars = list(name)
    # add <start> and <end> token
    chars = ['<.>'] + chars + ['<.>']

    for i in range(len(chars)-1):
        # ab
        a = chars[i]
        b = chars[i+1]
        pairs[char_to_num[a]][char_to_num[b]] += 1

draw_pair_matrix(pairs)
def predict_next_letter(letter, pairs):
    next_letter = max(range(len(pairs[0])), key=lambda i: pairs[char_to_num[letter]][i])
    return num_to_char[next_letter]

def generate_next_letters(first_letter, pairs):
    letter = first_letter

    result = ''
    result += letter

    while True:
        letter = predict_next_letter(letter, pairs)
        if letter == '<.>':
            break
        result += letter

    return result

generate_next_letters('S', pairs)
generate_next_letters('A', pairs)

Non-deterministic/Sampling

Always taking the argmax produces the same next letter for a given input, so generation is deterministic. Instead, we can sample the next letter from the probability distribution.


import torch

# convert pairs to tensor
pairs_tensor = torch.tensor(pairs, dtype=torch.float)
pairs_tensor
# convert into probability
prob_tensor = pairs_tensor / pairs_tensor.sum(dim=1, keepdim=True)
prob_tensor
prob_tensor[0]
prob_tensor[0].sum()
# do again, but with tensor probability
generate_next_letters('S', prob_tensor)
# draw one sample from each row of prob_tensor with torch.multinomial
pairs_multinomial = torch.multinomial(prob_tensor, 1, replacement=True)
pairs_multinomial
pairs_multinomial[char_to_num['V']]
num_to_char[pairs_multinomial[char_to_num['V']].item()]
# replace predict_next_letter with a multinomial-sampling version
def predict_next_letter_multinomial(letter, pairs):
    idx = char_to_num[letter]
    sampling = torch.multinomial(pairs[idx], 1, replacement=True)
    return num_to_char[sampling.item()]

predict_next_letter_multinomial('V', prob_tensor)
def generate_next_letters(first_letter, pairs):
    letter = first_letter

    result = ''
    result += letter

    while True:
        letter = predict_next_letter_multinomial(letter, pairs)
        if letter == '<.>':
            break
        result += letter

    return result

generate_next_letters('S', prob_tensor)

Trigram

Our bigram model is not enough to capture the context of the names. What if we add more context with a trigram model, which predicts the next character from the previous two?
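As a quick illustration (using the hypothetical name "SITI", not necessarily one from the dataset), these are the two-character contexts a trigram model conditions on:

example = ['<.>'] + list("SITI") + ['<.>', '<.>']
for a, b, c in zip(example, example[1:], example[2:]):
    # the pair (a, b) is the context, c is the character to predict
    print(f"({a}, {b}) -> {c}")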

triplets = [[[0 for _ in range(len(vocabs))] for _ in range(len(vocabs))] for _ in range(len(vocabs))]

for name in names:
    chars = list(name)
    # add <start> token and double <end> tokens to form valid ending trigrams
    chars = ['<.>'] + chars + ['<.>', '<.>']

    for i in range(len(chars)-2):  # adjust the loop so we always have a full triplet
        # abc
        a = chars[i]
        b = chars[i+1]
        c = chars[i+2]
        triplets[char_to_num[a]][char_to_num[b]][char_to_num[c]] += 1


draw_pair_matrix(triplets[char_to_num['S']])
import torch
triplets_tensor = torch.tensor(triplets, dtype=torch.float)
next_probs = triplets_tensor[char_to_num['S']].sum(dim=1)/triplets_tensor[char_to_num['S']].sum()
next_probs
def predict_next_letter_trigram(letter1, letter2, triplets_tensor):
    # calculate the probability
    idx1 = char_to_num[letter1]
    idx2 = char_to_num[letter2]
    next_probs = triplets_tensor[idx1][idx2] / triplets_tensor[idx1][idx2].sum()
    sampling = torch.multinomial(next_probs, 1, replacement=True)
    return num_to_char[sampling.item()]

predict_next_letter_trigram('S', 'U', triplets_tensor)
def generate_next_letters_trigram(first_letter, second_letter, triplets):
    letter1 = first_letter
    letter2 = second_letter

    result = ''
    result += letter1
    result += letter2

    while True:
        letter = predict_next_letter_trigram(letter1, letter2, triplets)
        if letter == '<.>':
            break

        letter1 = letter2
        letter2 = letter

        result += letter

    return result

generate_next_letters_trigram('S', 'I', triplets_tensor)

Quality Assessment

How do we know whether the generated text is good or not?

prob_tensor[char_to_num['S']]
# Max probability
prob_tensor[char_to_num['S']].max()
# Probability distribution, plot it
import matplotlib.pyplot as plt

plt.plot(prob_tensor[char_to_num['S']])
plt.show()


# to get the prediction, take the argmax
prediction = prob_tensor[char_to_num['S']].argmax()
num_to_char[prediction.item()]

The best probability the model can assign to the correct next letter is 1, and the worst is 0.

# draw the log plot for x from just above 0 to 1 (log(0) is undefined)
import numpy as np

x = np.linspace(0.01, 1, 100)
y = np.log(x)
plt.plot(x, y)
plt.show()

What we are interested in is the loss function: the negative log of the probability assigned to the correct next letter, which is 0 when that probability is 1 and grows without bound as it approaches 0.
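Concretely, the score we will use for a whole sequence is the average negative log-likelihood, NLL = -(1/N) * sum of log p(next letter | current letter). As a quick check with arbitrary illustrative values, -log(p) is small when the probability is high and large when it is low:

import numpy as np

for p in [0.9, 0.5, 0.1, 0.01]:
    print(p, -np.log(p))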

# draw the -log plot for x from just above 0 to 1
x = np.linspace(0.01, 1, 100)
y = -np.log(x)
plt.plot(x, y)
plt.show()


# negative log-likelihood for each possible next letter after 'S', plot it
import matplotlib.pyplot as plt

nll = -torch.log(prob_tensor[char_to_num['S']])
plt.plot(nll)
plt.show()
letters = "SUSI"
nll_pred_bigram = 0
nll_my_letter = 0

for i in range(len(letters)-1):
    print(letters[i])

    cur_prob = prob_tensor[char_to_num[letters[i]]]
    prediction = cur_prob.argmax().item()
    print("The prediction is", num_to_char[prediction], " with probability", cur_prob[prediction])
    print("The expected is", letters[i+1], " with probability", cur_prob[char_to_num[letters[i+1]]])

    nll_pred_bigram += -torch.log(cur_prob[prediction])
    nll_my_letter += -torch.log(cur_prob[char_to_num[letters[i+1]]])
    print()

print("-----")
print("The NLL for prediction is", nll_pred_bigram)
print("The NLL for my letter is", nll_my_letter)



Embedding & Neural Network

names
xs = []
ys = []

for name in names:
    chars = list(name)
    # add <start> and <end> token
    chars = ['<.>'] + chars + ['<.>']

    for i in range(len(chars)-1):
        # ab
        xs.append(chars[i])
        ys.append(chars[i+1])

list(zip(xs, ys))

# but we want xs and ys to be numbers
xs = [char_to_num[x] for x in xs]
ys = [char_to_num[y] for y in ys]

list(zip(xs, ys))
# convert into tensor
xs = torch.tensor(xs)
ys = torch.tensor(ys)
import torch.nn as nn

embed = nn.Embedding(len(vocabs), len(vocabs))
# print the embedding
print(embed.weight)

# Wait, isn't it similar to our bigram pairs?
# get the first row in the embed
embed(torch.tensor([0]))

# the second row
embed(torch.tensor([1]))
epoch = 100

embed = nn.Embedding(len(vocabs), len(vocabs))
optimizer = torch.optim.Adam(embed.parameters(), lr=0.1)

for _ in range(epoch):
    # zero the gradients
    optimizer.zero_grad()

    # get the prediction
    counts = embed(xs).exp()
    probs = counts / counts.sum(dim=1, keepdim=True)
    loss = -torch.log(probs[range(len(ys)), ys]).mean()

    # backprop
    loss.backward()

    # update the weights
    optimizer.step()

print(loss.item())
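As a side note, the exp/normalize/negative-log computation above is exactly the softmax cross-entropy, so the same loss could be computed with PyTorch's built-in helper. A minimal sketch, assuming the embed, xs, and ys defined above:

import torch.nn.functional as F

# embed(xs) are the logits; cross_entropy applies softmax and negative log-likelihood
logits = embed(xs)
loss = F.cross_entropy(logits, ys)
print(loss.item())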




idx = torch.tensor([char_to_num['S']])
counts = embed(idx).exp()
probs = counts / counts.sum(dim=1, keepdim=True)

# plot the probability
plt.plot(probs[0].detach().numpy())
plt.show()

# take the max probability
prediction = probs.argmax().item()
num_to_char[prediction]

def predict_next_letter_nn(letter, embed):
    idx = torch.tensor([char_to_num[letter]])
    counts = embed(idx).exp()
    probs = counts / counts.sum(dim=1, keepdim=True)

    return num_to_char[probs.argmax().item()]

predict_next_letter_nn('S', embed)
predict_next_letter_nn('U', embed)

predict_next_letter_nn('R', embed)
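
So far the embedding model only predicts a single next letter with argmax. For completeness, a minimal sketch of a sampling-based generator analogous to generate_next_letters above, assuming the embed, char_to_num, and num_to_char defined earlier:

def generate_next_letters_nn(first_letter, embed):
    letter = first_letter
    result = letter

    while True:
        idx = torch.tensor([char_to_num[letter]])
        # detach so we can sample without tracking gradients
        probs = embed(idx).exp().detach()
        probs = probs / probs.sum(dim=1, keepdim=True)

        # sample the next letter instead of taking the argmax
        letter = num_to_char[torch.multinomial(probs[0], 1).item()]
        if letter == '<.>':
            break
        result += letter

    return result

generate_next_letters_nn('S', embed)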