In this documentation, we'll explore how to enhance a model's performance within a specific domain. Here, I've focused on improving the physics-solving abilities of LLMs by applying targeted techniques.
Important Note:
Since we start from a pretrained LLM and want to improve its performance on a particular task, we have to teach it and train it, much the same way we learn when picking up something new. For this we need a dataset of domain-specific data; it makes no sense to teach the model mathematics and then ask it for the result of a chemistry reaction.
Try to find data related to your domain online. If you can't find a suitable dataset, curate your own, the way I curated a physics-problems dataset for this project. We use this dataset to train, test, and evaluate the models before and after fine-tuning with different techniques.
Mail me if you want the dataset used in this documentation.
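For reference, the code below assumes each record in the JSON file carries a question, a list of choices, the answer letter, and a subject label. The values in this example are purely illustrative; only the field names matter:
# Hypothetical example of one dataset record (illustrative values only;
# the field names match what the loading and splitting code below expects).
example_item = {
    "question": "A ball is dropped from rest. How far does it fall in 2 s? (take g = 9.8 m/s^2)",
    "choices": ["4.9 m", "9.8 m", "19.6 m", "39.2 m"],
    "answer": "C",             # letter indexing into "choices"
    "subject": "Kinematics"    # used for the stratified split
}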
from google.colab import drive
drive.mount('/content/drive')
Mount your Drive... because downloading the universe is still not allowed.
Dividing the dataset into three parts, train, test, and eval, in the right proportions is important for comparing the LLMs' performance before and after fine-tuning.
Since I am working in the physics domain, the dataset contains physics problems from different topics such as Kinematics, Electromagnetics, etc. I applied a stratified split into train, test, and eval so that every topic is evenly represented in each split.
import json
import random
from sklearn.model_selection import train_test_split
from collections import Counter
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
with open(r"/content/drive/MyDrive/dataset/high_school_physics.json", "r", encoding="utf-8") as f:
data = json.load(f)
subjects = [item["subject"] for item in data]
print("Original subject distribution:", Counter(subjects))
train_data, temp_data = train_test_split(
data,
train_size=0.7,
stratify=subjects,
random_state=RANDOM_SEED
)
test_data, eval_data = train_test_split(
temp_data,
test_size=0.5,
stratify=[item["subject"] for item in temp_data],
random_state=RANDOM_SEED
)
with open(r"/content/drive/MyDrive/dataset/train.json", "w", encoding="utf-8") as f:
json.dump(train_data, f, indent=4)
with open(r"/content/drive/MyDrive/dataset/test.json", "w", encoding="utf-8") as f:
json.dump(test_data, f, indent=4)
with open(r"/content/drive/MyDrive/dataset/eval.json", "w", encoding="utf-8") as f:
json.dump(eval_data, f, indent=4)
print(f"\nDataset split into {len(train_data)} training, {len(test_data)} testing, and {len(eval_data)} evaluation samples.")
print("\nSubject distribution in each split:")
print("Train:", Counter([item["subject"] for item in train_data]))
print("Test:", Counter([item["subject"] for item in test_data]))
print("Eval:", Counter([item["subject"] for item in eval_data]))
Statistically balanced data splitting: like cutting cake evenly at a birthday party.
And now the fun begins. We evaluate the flan-t5-base, Phi-3-mini, gpt-2, and gpt-neo models in a zero-shot setting. That means each model goes into battle with no prior training from us, like a gladiator with no sword. Let's see how well they fare!
Here is the baseline implementation for the flan-t5-base model; the decoder-only models are evaluated the same way, with a sketch shown right after this code.
from transformers import pipeline
qa_pipeline = pipeline("text2text-generation", model="google/flan-t5-base")
def evaluate_model(model, dataset):
    correct = 0
    total = len(dataset)
    for item in dataset:
        question = item["question"]
        # Map the answer letter ("A", "B", ...) to the corresponding choice text.
        idx = ord(item["answer"][0]) - ord("A")
        correct_answer = item["choices"][idx]
        prompt = f"Give me the final answer without any explanation, just the couple of words with units that directly show the answer for the Question: {question} with Choices: {', '.join(item['choices'])} Answer:"
        prediction = model(prompt, max_length=20, truncation=True)[0]["generated_text"]
        # Count the item as correct if the gold choice text appears in the generation.
        if correct_answer in prediction:
            correct += 1
    accuracy = (correct / total) * 100
    return accuracy
zero_shot_accuracy = evaluate_model(qa_pipeline, test_data)
print(f"Zero-Shot Accuracy: {zero_shot_accuracy:.2f}%")
Here I implemented the supervised fine-tuning (SFT) technique to train the model on the train split. We could also fine-tune with different reinforcement learning algorithms and compare which technique best suits the model.
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
from torch.utils.data import Dataset
import json
import os
import time
import torch
os.environ["WANDB_DISABLED"] = "true"
device = torch.device("cpu")
model_name = "google/flan-t5-base"
model_path = "/content/drive/MyDrive/dataset/trained_model"
# Resume from a previously saved checkpoint on Drive if one exists, otherwise start from the base model.
if os.path.exists(model_path):
    model = T5ForConditionalGeneration.from_pretrained(model_path).to(device)
    tokenizer = T5Tokenizer.from_pretrained(model_path, legacy=False)
else:
    model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)
    tokenizer = T5Tokenizer.from_pretrained(model_name, legacy=False)
# Wraps each multiple-choice item as a (prompt, correct-choice-text) pair for seq2seq training.
class PhysicsDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        prompt = f"Question: {item['question']} Choices: {', '.join(item['choices'])} Answer:"
        idx = ord(item["answer"][0]) - ord("A")
        target = item["choices"][idx]
        encodings = self.tokenizer(prompt, truncation=True, padding="max_length", max_length=self.max_length, return_tensors="pt")
        target_encodings = self.tokenizer(target, truncation=True, padding="max_length", max_length=self.max_length, return_tensors="pt")
        labels = target_encodings["input_ids"].squeeze()
        # Replace padding token ids in the labels with -100 so they are ignored by the loss.
        labels[labels == self.tokenizer.pad_token_id] = -100
        return {
            "input_ids": encodings["input_ids"].squeeze(),
            "attention_mask": encodings["attention_mask"].squeeze(),
            "labels": labels,
        }
with open("/content/drive/MyDrive/dataset/train.json", "r") as f:
train_data = json.load(f)
with open("/content/drive/MyDrive/dataset/eval.json", "r") as f:
eval_data = json.load(f)
with open("/content/drive/MyDrive/dataset/test.json", "r") as f:
test_data = json.load(f)
train_dataset = PhysicsDataset(train_data, tokenizer)
eval_dataset = PhysicsDataset(eval_data, tokenizer)
model_save_dir = "/content/drive/MyDrive/dataset/trained_model"
results_dir = "/content/drive/MyDrive/dataset/results"
os.makedirs(model_save_dir, exist_ok=True)
os.makedirs(results_dir, exist_ok=True)
training_args = TrainingArguments(
    output_dir=results_dir,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=3,
    save_steps=70,
    eval_strategy="epoch",
    logging_dir="./logs",
    run_name=f"flan-t5-finetune-{time.strftime('%Y%m%d-%H%M%S')}",
    report_to="none",
    learning_rate=1e-5,
    gradient_accumulation_steps=4,
    fp16=False,
    save_total_limit=2,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)
# A single trainer.train() call already runs all num_train_epochs epochs (calling it again
# inside an epoch loop would retrain the model several times over). Intermediate checkpoints
# are written to output_dir every save_steps steps.
trainer.train()

# Save the final fine-tuned model and tokenizer to Drive.
model.save_pretrained(model_save_dir)
tokenizer.save_pretrained(model_save_dir)
# Evaluate the fine-tuned model on the held-out test split with exact-match scoring.
correct = 0
predictions = []
for item in test_data:
    prompt = f"Question: {item['question']} Choices: {', '.join(item['choices'])} Answer:"
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model.generate(**inputs, max_length=10)
    prediction = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
    idx = ord(item["answer"][0]) - ord("A")
    target = item["choices"][idx]
    predictions.append({"question": item["question"], "predicted": prediction, "target": target})
    if prediction == target:
        correct += 1
accuracy = correct / len(test_data) * 100
with open("/content/drive/MyDrive/dataset/finetune_test_results.json", "w") as f:
    json.dump({"accuracy": accuracy, "predictions": predictions}, f, indent=4)