Welcome to PhysiSolve Docs 👋

In this documentation, we’ll explore how to enhance a model’s performance within a specific domain. Here, I’ve focused on improving the physics problem-solving abilities of LLMs by applying targeted techniques.

Important Note:


🔍 Step 1: Load and Prepare the Dataset

Since I am using a pretrained LLM and want to improve its performance on a particular task, I have to teach it: train it much the way we are trained when we learn something new. To do this we need a dataset of domain-specific data; teaching the model mathematics and then asking it for the result of a chemical reaction would not work.
Try to find data related to your domain online. If you cannot find a suitable dataset, curate your own, as I did with the physics problems dataset for this project. We use this dataset to train, evaluate and test the models, and to compare their performance before and after fine-tuning with different techniques.
Mail me if you want the dataset used in this documentation.

I used Google Colab for this project because of its free 12.7 GB CPU runtime and the unlimited space for storing my trained models through my school Drive account 😉
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset and model paths used
# in the following steps all live under this mount point.
drive.mount("/content/drive")

Mount your Drive... because downloading the universe is still not allowed 🚀

🎲 Step 2: Load Dataset & Split Data Smartly

Dividing the dataset into three parts (train, test and eval) in the right proportions is important for comparing the performance of the LLMs before and after fine-tuning.
Since I am working in the physics domain, my dataset contains physics problems on different topics such as Kinematics, Electromagnetics, etc. I applied a stratified split to divide the dataset into train, test and eval sets so that all topics are evenly represented in each one.

import json
import random
from sklearn.model_selection import train_test_split
from collections import Counter

# One fixed seed drives both `random` and the scikit-learn splits so the
# partition is reproducible.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)


def _subject_labels(records):
    """Return the "subject" field of every record, in order."""
    return [record["subject"] for record in records]


# Load the full question set from Drive.
with open(r"/content/drive/MyDrive/dataset/high_school_physics.json", "r", encoding="utf-8") as src:
    data = json.load(src)

subjects = _subject_labels(data)
print("Original subject distribution:", Counter(subjects))

# First cut: 70% for training, the remaining 30% held out; stratifying on
# the subject keeps the topic mix the same on both sides.
train_data, temp_data = train_test_split(
    data, train_size=0.7, stratify=subjects, random_state=RANDOM_SEED
)

# Second cut: divide the held-out part in half into test and eval, again
# stratified on subject.
test_data, eval_data = train_test_split(
    temp_data, test_size=0.5, stratify=_subject_labels(temp_data), random_state=RANDOM_SEED
)

# Write each split to its own JSON file next to the source dataset.
# Every `with` body is indented under its own header and the files are
# written one after another (the original nesting did not parse).
for split_name, split_records in (("train", train_data), ("test", test_data), ("eval", eval_data)):
    with open(f"/content/drive/MyDrive/dataset/{split_name}.json", "w", encoding="utf-8") as f:
        json.dump(split_records, f, indent=4)

# Report the split sizes and the per-split subject counts so the effect of
# stratification can be checked by eye.
print(f"\nDataset split into {len(train_data)} training, {len(test_data)} testing, and {len(eval_data)} evaluation samples.")
print("\nSubject distribution in each split:")
print("Train:", Counter([item["subject"] for item in train_data]))
print("Test:", Counter([item["subject"] for item in test_data]))
print("Eval:", Counter([item["subject"] for item in eval_data]))

Statistically balanced data splitting – like cutting cake evenly at a birthday party 🎂

🤖 Model Evaluation (Zero-shot)

And now the fun begins. We evaluate the flan-t5-base, Phi-3-mini, gpt-2 and gpt-neo models in a zero-shot setting. That means the model's going into battle with no prior training from us — like a gladiator with no sword 🗡️. Let’s see how well it fares!
Here is the base implementation for the flan-t5-base model:


from transformers import pipeline

# Text-to-text generation pipeline around the un-tuned flan-t5-base checkpoint.
qa_pipeline = pipeline(task="text2text-generation", model="google/flan-t5-base")

def evaluate_model(model, dataset):
    """Return the accuracy (0-100) of ``model`` on multiple-choice ``dataset``.

    Args:
        model: Callable invoked as ``model(prompt, max_length=20,
            truncation=True)``; it must return a sequence whose first
            element is a dict with a ``"generated_text"`` string.
        dataset: Sized iterable of dicts with the keys ``"question"``
            (str), ``"choices"`` (list of str) and ``"answer"`` (a string
            whose first character is the letter of the correct choice,
            ``"A"`` for the first choice).

    Returns:
        The percentage of items for which the text of the correct choice
        occurs in the generated output, or ``0.0`` for an empty dataset.
    """
    total = len(dataset)
    if total == 0:
        # Nothing to score; avoid dividing by zero below.
        return 0.0

    correct = 0
    for item in dataset:
        question = item["question"]
        choices = item["choices"]

        # Only the leading letter selects the choice, so answers such as
        # "B", "b" or "B. 10 m/s" all resolve to the same index.
        answer_letter = item["answer"].strip()[0].upper()
        correct_answer = choices[ord(answer_letter) - ord("A")]

        prompt = f"Give me the final answer without any explanation, just the couple of words with units that directly show the answer for the Question: {question} with Choices: {', '.join(choices)} Answer:"
        prediction = model(prompt, max_length=20, truncation=True)[0]["generated_text"]

        # Substring match: the generated text counts as correct when it
        # contains the exact text of the right choice.
        if correct_answer in prediction:
            correct += 1

    return (correct / total) * 100

# Score the base pipeline on the held-out test split and report the result.
zero_shot_accuracy = evaluate_model(model=qa_pipeline, dataset=test_data)
print("Zero-Shot Accuracy: {:.2f}%".format(zero_shot_accuracy))
            

👨‍🏫 Supervised Fine-tuning of the Model on the Train Dataset

Here I implemented supervised fine-tuning to train the model on the train dataset. We can also fine-tune using different reinforcement learning algorithms and compare which technique best suits the model.

      
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
from torch.utils.data import Dataset
import json
import os
import time
import torch

# Turn off the Weights & Biases integration for this run.
os.environ["WANDB_DISABLED"] = "true"

device = torch.device("cpu")

model_name = "google/flan-t5-base"
model_path = "/content/drive/MyDrive/dataset/trained_model"

# Continue from a previously saved fine-tuned model when one is present,
# otherwise start from the base checkpoint. The test looks for config.json
# rather than the directory itself: the directory is created before
# training starts, so it can exist while still being empty.
checkpoint = model_path if os.path.isfile(os.path.join(model_path, "config.json")) else model_name
model = T5ForConditionalGeneration.from_pretrained(checkpoint).to(device)
tokenizer = T5Tokenizer.from_pretrained(checkpoint, legacy=False)


class PhysicsDataset(Dataset):
    """Multiple-choice physics questions tokenized for seq2seq fine-tuning.

    Each record must provide ``"question"`` (str), ``"choices"`` (list of
    str) and ``"answer"`` (a string whose first character is the letter of
    the correct choice, ``"A"`` for the first choice).

    Args:
        data: Indexable collection of records.
        tokenizer: Tokenizer called on the prompt and on the target text.
        max_length: Length that both prompt and target are padded or
            truncated to.
    """

    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for record ``idx``."""
        item = self.data[idx]
        prompt = f"Question: {item['question']} Choices: {', '.join(item['choices'])} Answer:"

        # The answer letter selects the choice whose text is the target.
        choice_idx = ord(item["answer"][0]) - ord("A")
        target = item["choices"][choice_idx]

        encodings = self.tokenizer(
            prompt,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        target_encodings = self.tokenizer(
            target,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )

        # Drop only the leading batch dimension added by return_tensors="pt".
        labels = target_encodings["input_ids"].squeeze(0)

        # Label positions set to -100 are ignored by the loss; without this
        # the model is also trained to predict the padding tokens.
        pad_id = self.tokenizer.pad_token_id
        if pad_id is not None:
            labels = labels.masked_fill(labels == pad_id, -100)

        return {
            "input_ids": encodings["input_ids"].squeeze(0),
            "attention_mask": encodings["attention_mask"].squeeze(0),
            "labels": labels,
        }

# Reload the three splits from Drive. The files are opened one after
# another (each `with` body indented under its own header) and decoded
# as UTF-8 explicitly instead of relying on the platform default.
with open("/content/drive/MyDrive/dataset/train.json", "r", encoding="utf-8") as f:
    train_data = json.load(f)
with open("/content/drive/MyDrive/dataset/eval.json", "r", encoding="utf-8") as f:
    eval_data = json.load(f)
with open("/content/drive/MyDrive/dataset/test.json", "r", encoding="utf-8") as f:
    test_data = json.load(f)

# Output locations on Drive: the final model and the Trainer's own output.
model_save_dir = "/content/drive/MyDrive/dataset/trained_model"
results_dir = "/content/drive/MyDrive/dataset/results"
for _out_dir in (model_save_dir, results_dir):
    os.makedirs(_out_dir, exist_ok=True)

# Wrap the train and eval records so the Trainer can index them.
train_dataset = PhysicsDataset(data=train_data, tokenizer=tokenizer)
eval_dataset = PhysicsDataset(data=eval_data, tokenizer=tokenizer)

# Hyperparameters collected in one mapping, then handed to TrainingArguments.
_training_config = {
    "output_dir": results_dir,
    # One sample per forward pass, with gradients accumulated over 4 passes.
    "per_device_train_batch_size": 1,
    "per_device_eval_batch_size": 1,
    "gradient_accumulation_steps": 4,
    "num_train_epochs": 3,
    "learning_rate": 1e-5,
    "fp16": False,
    # Evaluate at the end of each epoch; save every 70 steps and keep at
    # most two saved checkpoints.
    "eval_strategy": "epoch",
    "save_steps": 70,
    "save_total_limit": 2,
    # Local logs only; no external experiment tracker.
    "logging_dir": "./logs",
    "report_to": "none",
    "run_name": f"flan-t5-finetune-{time.strftime('%Y%m%d-%H%M%S')}",
}
training_args = TrainingArguments(**_training_config)

trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=eval_dataset,
    train_dataset=train_dataset,
)
    
from transformers import TrainerCallback


class _SaveEpochSnapshot(TrainerCallback):
    """Save model and tokenizer to ``<model_save_dir>_epoch_<n>`` after each epoch."""

    def on_epoch_end(self, args, state, control, **kwargs):
        # state.epoch is a float; round it to get the 1-based number of
        # the epoch that has just finished.
        epoch_number = int(round(state.epoch))
        snapshot_dir = f"{model_save_dir}_epoch_{epoch_number}"
        model.save_pretrained(snapshot_dir)
        tokenizer.save_pretrained(snapshot_dir)


# A single trainer.train() call already runs all `num_train_epochs`
# epochs. Calling it once per epoch in a loop would repeat the whole
# schedule each time (3 x 3 epochs) and label the snapshots with the wrong
# epoch, so the per-epoch snapshots are taken from a callback instead.
trainer.add_callback(_SaveEpochSnapshot())
trainer.train()

# Final model and tokenizer, saved where later runs look for them.
model.save_pretrained(model_save_dir)
tokenizer.save_pretrained(model_save_dir)

# Evaluate the fine-tuned model on the test split by exact string match
# between the generated text and the text of the correct choice.
model.eval()  # make sure dropout is disabled while generating

correct = 0
predictions = []
for item in test_data:
    prompt = f"Question: {item['question']} Choices: {', '.join(item['choices'])} Answer:"
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    # NOTE(review): max_length=10 caps the generated answer; choices longer
    # than that can never match exactly. Confirm this limit is intended.
    outputs = model.generate(**inputs, max_length=10)
    prediction = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

    idx = ord(item["answer"][0]) - ord("A")
    target = item["choices"][idx]
    predictions.append({"question": item["question"], "predicted": prediction, "target": target})

    # The prediction was stripped above, so strip the target as well;
    # otherwise stray whitespace in the data makes a match impossible.
    if prediction == target.strip():
        correct += 1

# An empty test split scores 0 instead of raising ZeroDivisionError.
accuracy = correct / len(test_data) * 100 if test_data else 0.0

with open("/content/drive/MyDrive/dataset/finetune_test_results.json", "w", encoding="utf-8") as f:
    json.dump({"accuracy": accuracy, "predictions": predictions}, f, indent=4)