GPT from Scratch

Notes on building a GPT from scratch.

# imports

import warnings 
import os 
from typing import List

warnings.filterwarnings("ignore")
# Load the entire training corpus into memory as a single string.
with open("./inputs.txt", encoding="utf-8") as f:
    text = f.read()

# Report the corpus size in characters.
print(len(text))
1115394
# Vocabulary: every distinct character in the corpus, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)

# Show the vocabulary as one unbroken string, then its size.
print(*chars, sep="")
print(vocab_size)
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65
# Tokenize the text at the character level.

# This is a character-level language model: each distinct character is one token.

def encode(text: str) -> List[int]:
    """Translate *text* into a list of integer token ids, one per character.

    Ids come from each character's position in the module-level ``chars``
    sequence, counting from 1.

    Raises:
        KeyError: if *text* contains a character that is not in ``chars``.
    """
    # NOTE(review): ids are 1-based, so the largest id equals len(chars);
    # confirm that consumers sized by vocab_size expect that range.
    char_to_id = {symbol: token_id for token_id, symbol in enumerate(chars, 1)}
    return [char_to_id[symbol] for symbol in text]



def decode(token: List[int]) -> str:
    """Convert a sequence of integer token ids back into a string.

    Each id is looked up by its 1-based position in the module-level
    ``chars`` sequence, mirroring the numbering used by ``encode``.

    Args:
        token: Token ids to decode, in order.

    Returns:
        The decoded text; an empty string when *token* is empty.

    Raises:
        KeyError: if an id does not correspond to any entry in ``chars``.
    """
    id_to_char = dict(enumerate(chars, start=1))
    # Build the result in a single pass; repeated ``rv = rv + piece`` may copy
    # the growing string on every step, which is quadratic for long inputs.
    return "".join(id_to_char[num] for num in token)


# Sanity check: decoding an encoded string must reproduce the original text.
original_txt = "hi there"
encoded_token = encode(original_txt)
decoded_txt = decode(encoded_token)

assert decoded_txt == original_txt, "Error in Encoder - Deocder"