
Building a Small Language Model


This demo is heavily influenced by Karpathy's tutorial


Load Dataset

The names come from the list of legislative candidates for West Java (Jawa Barat) published on KPU Open Data:

https://opendata.kpu.go.id/dataset/0bfd84b3f-fa1e47b36-dd6d5a452-73feb

import pandas as pd

df = pd.read_csv('jabar.csv', sep=',', header=None, names=['dapil', 'provinsi', 'nama_dapil', 'nomor_partai', 'nama_partai', 'nomor_urut', 'nama_caleg', 'jenis_kelamin'])
names = df['nama_caleg']
names

Data Cleansing

names.describe()
# remove the header
names = names[1:]
names
# keep only the part before the first comma (drops trailing titles)
names = names.str.split(',').str[0]
names
# keep only the part after the last dot (drops leading titles)
names = names.str.split('.').str[-1]
names

# remove trailing and leading whitespace
names = names.str.strip()
names
# upper case
names = names.str.upper()
names
# get all unique chars in the names
unique_chars = sorted(list(set(''.join(names))))
unique_chars
# remove (, ), -, 0, \x81, ™
names = names.str.replace(r'[\(\)\-0\x81™]', '', regex=True)

# get all unique chars in the names
unique_chars = sorted(list(set(''.join(names))))
unique_chars

Convert to Numbers

# define the vocabs
vocabs = sorted(list(set(''.join(names))))
vocabs
# add the <.> start/end token to the vocabs
vocabs = ['<.>'] + vocabs
vocabs
# map to numbers and vice versa
char_to_num = {char: i for i, char in enumerate(vocabs)}
num_to_char = {i: char for i, char in enumerate(vocabs)}

char_to_num

Generate Pairs

# generate pair matrix
pairs = [[0 for _ in range(len(vocabs))] for _ in range(len(vocabs))]

# draw pair matrix
def draw_pair_matrix(pairs):
    # the first line is the label
    print('\t' + '\t'.join([num_to_char[i] for i in range(len(pairs))]))

    for i in range(len(pairs)):
        # the first column is the label
        print(num_to_char[i], end='\t')
        print('\t'.join([str(p) for p in pairs[i]]))

draw_pair_matrix(pairs)
# Count the pairs
for name in names:
    chars = list(name)
    # add <start> and <end> token
    chars = ['<.>'] + chars + ['<.>']

    for i in range(len(chars)-1):
        # ab
        a = chars[i]
        b = chars[i+1]
        pairs[char_to_num[a]][char_to_num[b]] += 1

draw_pair_matrix(pairs)
def predict_next_letter(letter, pairs):
    next_letter = max(range(len(pairs[0])), key=lambda i: pairs[char_to_num[letter]][i])
    return num_to_char[next_letter]

def generate_next_letters(first_letter, pairs):
    letter = first_letter

    result = ''
    result += letter

    while True:
        letter = predict_next_letter(letter, pairs)
        if letter == '<.>':
            break
        result += letter

    return result

generate_next_letters('S', pairs)
generate_next_letters('A', pairs)

Non-deterministic/Sampling

Always taking the argmax produces the same next letter for a given input, so generation is deterministic. Instead, we can sample the next letter from the probability distribution.


import torch

# convert pairs to tensor
pairs_tensor = torch.tensor(pairs, dtype=torch.float)
pairs_tensor
# convert into probability
prob_tensor = pairs_tensor / pairs_tensor.sum(dim=1, keepdim=True)
prob_tensor
prob_tensor[0]
prob_tensor[0].sum()
# do again, but with tensor probability
generate_next_letters('S', prob_tensor)
# draw one sample from each row of prob_tensor with torch.multinomial
pairs_multinomial = torch.multinomial(prob_tensor, 1, replacement=True)
pairs_multinomial
pairs_multinomial[char_to_num['V']]
num_to_char[pairs_multinomial[char_to_num['V']].item()]
# replace predict_next_letter with a multinomial-sampling version
def predict_next_letter_multinomial(letter, pairs):
    idx = char_to_num[letter]
    sampling = torch.multinomial(pairs[idx], 1, replacement=True)
    return num_to_char[sampling.item()]

predict_next_letter_multinomial('V', prob_tensor)
def generate_next_letters(first_letter, pairs):
    letter = first_letter

    result = ''
    result += letter

    while True:
        letter = predict_next_letter_multinomial(letter, pairs)
        if letter == '<.>':
            break
        result += letter

    return result

generate_next_letters('S', prob_tensor)

Trigram

Our bigram model is not enough to capture the context of the names. What if we add more context with a trigram model, which predicts the next character from the previous two?
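As a quick illustration (using the hypothetical name "SITI", not necessarily one from the dataset), these are the two-character contexts a trigram model conditions on:

example = ['<.>'] + list("SITI") + ['<.>', '<.>']
for a, b, c in zip(example, example[1:], example[2:]):
    # the pair (a, b) is the context, c is the character to predict
    print(f"({a}, {b}) -> {c}")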

triplets = [[[0 for _ in range(len(vocabs))] for _ in range(len(vocabs))] for _ in range(len(vocabs))]

for name in names:
    chars = list(name)
    # add <start> token and double <end> tokens to form valid ending trigrams
    chars = ['<.>'] + chars + ['<.>', '<.>']

    for i in range(len(chars)-2):  # adjust the loop so we always have a full triplet
        # abc
        a = chars[i]
        b = chars[i+1]
        c = chars[i+2]
        triplets[char_to_num[a]][char_to_num[b]][char_to_num[c]] += 1


draw_pair_matrix(triplets[char_to_num['S']])
import torch
triplets_tensor = torch.tensor(triplets, dtype=torch.float)
next_probs = triplets_tensor[char_to_num['S']].sum(dim=1)/triplets_tensor[char_to_num['S']].sum()
next_probs
def predict_next_letter_trigram(letter1, letter2, triplets_tensor):
    # calculate the probability
    idx1 = char_to_num[letter1]
    idx2 = char_to_num[letter2]
    next_probs = triplets_tensor[idx1][idx2] / triplets_tensor[idx1][idx2].sum()
    sampling = torch.multinomial(next_probs, 1, replacement=True)
    return num_to_char[sampling.item()]

predict_next_letter_trigram('S', 'U', triplets_tensor)
def generate_next_letters_trigram(first_letter, second_letter, triplets):
    letter1 = first_letter
    letter2 = second_letter

    result = ''
    result += letter1
    result += letter2

    while True:
        letter = predict_next_letter_trigram(letter1, letter2, triplets)
        if letter == '<.>':
            break

        letter1 = letter2
        letter2 = letter

        result += letter

    return result

generate_next_letters_trigram('S', 'I', triplets_tensor)

Quality Assessment

How do we know whether the generated text is good or not?

prob_tensor[char_to_num['S']]
# Max probability
prob_tensor[char_to_num['S']].max()
# Probability distribution, plot it
import matplotlib.pyplot as plt

plt.plot(prob_tensor[char_to_num['S']])
plt.show()


# to get the prediction, take the argmax
prediction = prob_tensor[char_to_num['S']].argmax()
num_to_char[prediction.item()]

The best probability the model can assign to the correct next letter is 1, and the worst is 0.

# draw the log plot for x from just above 0 to 1 (log(0) is undefined)
import numpy as np

x = np.linspace(0.01, 1, 100)
y = np.log(x)
plt.plot(x, y)
plt.show()

What we are interested in is the loss function: the negative log of the probability assigned to the correct next letter, which is 0 when that probability is 1 and grows without bound as it approaches 0.
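Concretely, the score we will use for a whole sequence is the average negative log-likelihood, NLL = -(1/N) * sum of log p(next letter | current letter). As a quick check with arbitrary illustrative values, -log(p) is small when the probability is high and large when it is low:

import numpy as np

for p in [0.9, 0.5, 0.1, 0.01]:
    print(p, -np.log(p))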

# draw the -log plot for x from just above 0 to 1
x = np.linspace(0.01, 1, 100)
y = -np.log(x)
plt.plot(x, y)
plt.show()


# negative log-likelihood for each possible next letter after 'S', plot it
import matplotlib.pyplot as plt

nll = -torch.log(prob_tensor[char_to_num['S']])
plt.plot(nll)
plt.show()
letters = "SUSI"
nll_pred_bigram = 0
nll_my_letter = 0

for i in range(len(letters)-1):
    print(letters[i])

    cur_prob = prob_tensor[char_to_num[letters[i]]]
    prediction = cur_prob.argmax().item()
    print("The prediction is", num_to_char[prediction], " with probability", cur_prob[prediction])
    print("The expected is", letters[i+1], " with probability", cur_prob[char_to_num[letters[i+1]]])

    nll_pred_bigram += -torch.log(cur_prob[prediction])
    nll_my_letter += -torch.log(cur_prob[char_to_num[letters[i+1]]])
    print()

print("-----")
print("The NLL for prediction is", nll_pred_bigram)
print("The NLL for my letter is", nll_my_letter)



Embedding & Neural Network

names
xs = []
ys = []

for name in names:
    chars = list(name)
    # add <start> and <end> token
    chars = ['<.>'] + chars + ['<.>']

    for i in range(len(chars)-1):
        # ab
        xs.append(chars[i])
        ys.append(chars[i+1])

list(zip(xs, ys))

# but we want xs and ys to be numbers
xs = [char_to_num[x] for x in xs]
ys = [char_to_num[y] for y in ys]

list(zip(xs, ys))
# convert into tensor
xs = torch.tensor(xs)
ys = torch.tensor(ys)
import torch.nn as nn

embed = nn.Embedding(len(vocabs), len(vocabs))
# print the embedding
print(embed.weight)

# Wait, isn't it similar to our bigram pairs?
# get the first row in the embed
embed(torch.tensor([0]))

# the second row
embed(torch.tensor([1]))
epoch = 100

embed = nn.Embedding(len(vocabs), len(vocabs))
optimizer = torch.optim.Adam(embed.parameters(), lr=0.1)

for _ in range(epoch):
    # zero the gradients
    optimizer.zero_grad()

    # get the prediction
    counts = embed(xs).exp()
    probs = counts / counts.sum(dim=1, keepdim=True)
    loss = -torch.log(probs[range(len(ys)), ys]).mean()

    # backprop
    loss.backward()

    # update the weights
    optimizer.step()

print(loss.item())
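As a side note, the exp/normalize/negative-log computation above is exactly the softmax cross-entropy, so the same loss could be computed with PyTorch's built-in helper. A minimal sketch, assuming the embed, xs, and ys defined above:

import torch.nn.functional as F

# embed(xs) are the logits; cross_entropy applies softmax and negative log-likelihood
logits = embed(xs)
loss = F.cross_entropy(logits, ys)
print(loss.item())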




idx = torch.tensor([char_to_num['S']])
counts = embed(idx).exp()
probs = counts / counts.sum(dim=1, keepdim=True)

# plot the probability
plt.plot(probs[0].detach().numpy())
plt.show()

# take the max probability
prediction = probs.argmax().item()
num_to_char[prediction]

def predict_next_letter_nn(letter, embed):
    idx = torch.tensor([char_to_num[letter]])
    counts = embed(idx).exp()
    probs = counts / counts.sum(dim=1, keepdim=True)

    return num_to_char[probs.argmax().item()]

predict_next_letter_nn('S', embed)
predict_next_letter_nn('U', embed)

predict_next_letter_nn('R', embed)
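
So far the embedding model only predicts a single next letter with argmax. For completeness, a minimal sketch of a sampling-based generator analogous to generate_next_letters above, assuming the embed, char_to_num, and num_to_char defined earlier:

def generate_next_letters_nn(first_letter, embed):
    letter = first_letter
    result = letter

    while True:
        idx = torch.tensor([char_to_num[letter]])
        # detach so we can sample without tracking gradients
        probs = embed(idx).exp().detach()
        probs = probs / probs.sum(dim=1, keepdim=True)

        # sample the next letter instead of taking the argmax
        letter = num_to_char[torch.multinomial(probs[0], 1).item()]
        if letter == '<.>':
            break
        result += letter

    return result

generate_next_letters_nn('S', embed)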