
Programming Language Classification

RoBERTa PyTorch with W&B Integration [TRAINING ONLY]

An end-to-end training notebook to get started with PyTorch and state-of-the-art Transformers.

jinoooooooooo

Imports and installs

In [ ]:
!pip install --upgrade wandb &> /dev/null
!pip install transformers &> /dev/null
In [ ]:
import os
import gc
import copy
import time
import random
import string

# For data manipulation
import numpy as np
import pandas as pd

# Pytorch Imports
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader

# Utils
from tqdm import tqdm
from collections import defaultdict

# Sklearn Imports
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, KFold

# For Transformer Models
# Note: AdamW is deprecated in newer transformers releases; torch.optim.AdamW is a drop-in replacement
from transformers import AutoTokenizer, AutoModel, AdamW

# Suppress warnings
import warnings
warnings.filterwarnings("ignore")

# For descriptive error messages
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

import matplotlib.pyplot as plt
import seaborn as sns
In [ ]:
import wandb
wandb.login(key = 'enter your key here')
In [ ]:
def id_generator(size=12, chars=string.ascii_lowercase + string.digits):
    '''Generates a random string of lowercase letters and digits (12 characters by default).'''
    return ''.join(random.SystemRandom().choice(chars) for _ in range(size))

HASH_NAME = id_generator(size=12)
print(HASH_NAME)

Configs

In [ ]:
CONFIG = {"seed": 2021,
          "epochs": 10,
          "criterion": nn.CrossEntropyLoss(),
          "model_name": "roberta-base",
          "train_batch_size": 32,
          "valid_batch_size": 64,
          "max_length": 128,
          "learning_rate": 1e-4,
          "scheduler": 'CosineAnnealingLR',
          "min_lr": 1e-6,
          "T_max": 500,
          "weight_decay": 1e-6,
          "n_fold": 5,
          "n_accumulate": 1,
          "num_classes": 15,
          "device": torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
          "hash_name": HASH_NAME
          }

CONFIG["tokenizer"] = AutoTokenizer.from_pretrained(CONFIG['model_name'])
CONFIG['group'] = f'{HASH_NAME}-Baseline'
In [ ]:
def set_seed(seed=42):
    '''Sets the seed for reproducibility across random, numpy and torch.'''
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    os.environ['PYTHONHASHSEED'] = str(seed)
    
set_seed(CONFIG['seed'])

Data (you can either download it manually or fetch it with the aicrowd CLI; a CLI sketch follows below)
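
Optional: a minimal sketch of the aicrowd CLI route. The challenge slug below is a placeholder (not taken from this notebook), so swap in the slug from the challenge page.

In [ ]:
!pip install aicrowd-cli &> /dev/null
# Authenticate with the API key from your AIcrowd profile
!aicrowd login
# Download the dataset files for the challenge (replace the placeholder slug)
!aicrowd dataset download --challenge <challenge-slug>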

In [ ]:
train_df = pd.read_csv('../input/train-lang-class/blitz_lang_class.csv')
train_df.head(2)
In [ ]:
plt.figure(figsize = (14, 7))
# Class distribution of the target languages
sns.countplot(x = 'language', data = train_df)

Creating folds

In [ ]:
skf = StratifiedKFold(n_splits=CONFIG['n_fold'], shuffle=True, random_state=CONFIG['seed'])
for fold, (train_idx, val_idx) in enumerate(skf.split(X = train_df, y = train_df.language)):
  train_df.loc[val_idx, 'kfold'] = int(fold)

train_df.sample(4)
In [ ]:
from sklearn.preprocessing import LabelEncoder
LE = LabelEncoder().fit(train_df.language)

train_df['target'] = LE.transform(train_df.language)

train_df.sample(3)
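
Not required, but a quick way to sanity-check the encoding is to print the language-to-integer mapping produced by the LabelEncoder.

In [ ]:
# Inspect the language -> integer mapping
dict(zip(LE.classes_, LE.transform(LE.classes_)))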

Dataset

In [ ]:
class Blitz(Dataset):
  def __init__(self, df, tokenizer, max_length):
    self.df = df
    self.tokenizer = tokenizer
    self.max_len = max_length
    self.code = df['code'].values
    self.target = df['target'].values
  
  def __len__(self):
    return len(self.df)
  
  def __getitem__(self,index):
    code = self.code[index]
    target = self.target[index]
    inputs_code = self.tokenizer.encode_plus(
                  code,
                  truncation = True,
                  add_special_tokens = True,
                  max_length = self.max_len,
                  padding = 'max_length'
                  ) 

    code_ids = inputs_code['input_ids']
    code_mask = inputs_code['attention_mask']

    return {
            'code_ids': torch.tensor(code_ids, dtype=torch.long),
            'code_mask': torch.tensor(code_mask, dtype=torch.long),
            'target': torch.tensor(target, dtype=torch.long)
        }

Testing

In [ ]:
sample_ds = Blitz(train_df, tokenizer = CONFIG['tokenizer'], max_length = CONFIG['max_length'])
print(len(sample_ds))
In [ ]:
next(iter(sample_ds))
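
One more optional check: decoding a sample back to text confirms that truncation and padding behave as expected at max_length = 128.

In [ ]:
sample = sample_ds[0]
print(sample['code_ids'].shape, sample['code_mask'].shape)
# Decode the token ids back to the (truncated, padded) text
print(CONFIG['tokenizer'].decode(sample['code_ids']))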

Creating model

In [ ]:
class BlitzModel(nn.Module):
    def __init__(self, model_name):
        super(BlitzModel, self).__init__()
        self.model = AutoModel.from_pretrained(model_name)
        self.drop = nn.Dropout(p = 0.2)
        self.fc = nn.Linear(768, CONFIG['num_classes'])
        
    def forward(self, ids, mask):        
        out = self.model(input_ids = ids, attention_mask = mask,
                         output_hidden_states = False)
        out = self.drop(out[1])  # out[1] is the pooled representation of the sequence
        outputs = self.fc(out)
        return outputs
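
Before wiring up training, a throwaway forward pass on a single sample is a cheap sanity check that the classification head produces a (batch, num_classes) output. This cell is purely illustrative and can be skipped.

In [ ]:
# Shape check: one sample through the model should give torch.Size([1, 15])
_m = BlitzModel(CONFIG['model_name'])
_s = sample_ds[0]
with torch.no_grad():
    _out = _m(_s['code_ids'].unsqueeze(0), _s['code_mask'].unsqueeze(0))
print(_out.shape)
del _m, _s, _out
_ = gc.collect()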

Training

In [ ]:
def train_one_epoch(model, optimizer, scheduler, dataloader, device, epoch):
    model.train()
    
    dataset_size = 0
    running_loss = 0.0
    
    bar = tqdm(enumerate(dataloader), total=len(dataloader))
    for step, data in bar:
        code_ids = data['code_ids'].to(device, dtype = torch.long)
        code_mask = data['code_mask'].to(device, dtype = torch.long)
        targets = data['target'].to(device, dtype=torch.long)
        
        batch_size = code_ids.size(0)

        code_outputs = model(code_ids, code_mask)
        
        crit = CONFIG['criterion']
        loss = crit(code_outputs, targets)
        loss = loss / CONFIG['n_accumulate']
        loss.backward()
    
        if (step + 1) % CONFIG['n_accumulate'] == 0:
            optimizer.step()

            optimizer.zero_grad()

            if scheduler is not None:
                scheduler.step()
                
        running_loss += (loss.item() * batch_size)
        dataset_size += batch_size
        
        epoch_loss = running_loss / dataset_size
        
        bar.set_postfix(Epoch=epoch, Train_Loss=epoch_loss,
                        LR=optimizer.param_groups[0]['lr'])
    gc.collect()
    
    return epoch_loss

Validation

In [ ]:
@torch.no_grad()
def valid_one_epoch(model, optimizer, dataloader, device, epoch):
    model.eval()
    
    dataset_size = 0
    running_loss = 0.0
    
    bar = tqdm(enumerate(dataloader), total=len(dataloader))
    for step, data in bar:        
        code_ids = data['code_ids'].to(device, dtype = torch.long)
        code_mask = data['code_mask'].to(device, dtype = torch.long)
        targets = data['target'].to(device, dtype=torch.long)
        
        batch_size = code_ids.size(0)

        code_outputs = model(code_ids, code_mask)
        
        crit = CONFIG['criterion']
        loss = crit(code_outputs, targets)
        
        running_loss += (loss.item() * batch_size)
        dataset_size += batch_size
        
        epoch_loss = running_loss / dataset_size
        
        bar.set_postfix(Epoch=epoch, Valid_Loss=epoch_loss,
                        LR=optimizer.param_groups[0]['lr'])   
    
    gc.collect()
    
    return epoch_loss

Run training

In [ ]:
def run_training(model, optimizer, scheduler, device, num_epochs, fold):
    wandb.watch(model, log_freq=100)
    
    if torch.cuda.is_available():
        print("[INFO] Using GPU: {}\n".format(torch.cuda.get_device_name()))
    
    start = time.time()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_epoch_loss = np.inf
    history = defaultdict(list)
    
    for epoch in range(1, num_epochs + 1): 
        gc.collect()
        train_epoch_loss = train_one_epoch(model, optimizer, scheduler, 
                                           dataloader=train_loader, 
                                           device=CONFIG['device'], epoch=epoch)
        
        val_epoch_loss = valid_one_epoch(model, optimizer, valid_loader, device=CONFIG['device'], 
                                         epoch=epoch)
    
        history['Train Loss'].append(train_epoch_loss)
        history['Valid Loss'].append(val_epoch_loss)
        
        # Log the metrics
        wandb.log({"Train Loss": train_epoch_loss})
        wandb.log({"Valid Loss": val_epoch_loss})
        
        # deep copy the model
        if val_epoch_loss <= best_epoch_loss:
            print(f"Validation Loss Improved ({best_epoch_loss} ---> {val_epoch_loss})")
            best_epoch_loss = val_epoch_loss
            run.summary["Best Loss"] = best_epoch_loss
            best_model_wts = copy.deepcopy(model.state_dict())
            PATH = f"Loss-Fold-{fold}.bin"
            torch.save(model.state_dict(), PATH)
            # Save a model file from the current directory
            print(f"Model Saved")
            
        print()
    
    end = time.time()
    time_elapsed = end - start
    print('Training complete in {:.0f}h {:.0f}m {:.0f}s'.format(
        time_elapsed // 3600, (time_elapsed % 3600) // 60, (time_elapsed % 3600) % 60))
    print("Best Loss: {:.4f}".format(best_epoch_loss))
    
    # load best model weights
    model.load_state_dict(best_model_wts)
    
    return model, history
In [ ]:
def prepare_loaders(fold):
    df_train = train_df[train_df.kfold != fold].reset_index(drop=True)
    df_valid = train_df[train_df.kfold == fold].reset_index(drop=True)
    
    train_dataset = Blitz(df_train, tokenizer=CONFIG['tokenizer'], max_length=CONFIG['max_length'])
    valid_dataset = Blitz(df_valid, tokenizer=CONFIG['tokenizer'], max_length=CONFIG['max_length'])

    train_loader = DataLoader(train_dataset, batch_size=CONFIG['train_batch_size'], 
                              num_workers=2, shuffle=True, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset, batch_size=CONFIG['valid_batch_size'], 
                              num_workers=2, shuffle=False, pin_memory=True)
    
    return train_loader, valid_loader
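
Optional: pulling a single batch from the fold-0 loaders verifies the tensor shapes before launching full training.

In [ ]:
_tl, _vl = prepare_loaders(fold=0)
_batch = next(iter(_tl))
# Expect (train_batch_size, max_length) for ids/mask and (train_batch_size,) for targets
print(_batch['code_ids'].shape, _batch['code_mask'].shape, _batch['target'].shape)
del _tl, _vl, _batch
_ = gc.collect()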
In [ ]:
def fetch_scheduler(optimizer):
    if CONFIG['scheduler'] == 'CosineAnnealingLR':
        scheduler = lr_scheduler.CosineAnnealingLR(optimizer,T_max=CONFIG['T_max'], 
                                                   eta_min=CONFIG['min_lr'])
    elif CONFIG['scheduler'] == 'CosineAnnealingWarmRestarts':
        scheduler = lr_scheduler.CosineAnnealingWarmRestarts(optimizer,T_0=CONFIG['T_0'], 
                                                             eta_min=CONFIG['min_lr'])
    elif CONFIG['scheduler'] is None:
        return None
        
    return scheduler
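
To see what CosineAnnealingLR with T_max = 500 does over a run, the cell below plots the schedule using a throwaway optimizer. Purely illustrative; the 1500-step horizon is an arbitrary assumption, not the actual number of optimizer steps per fold.

In [ ]:
# Plot the LR schedule for a dummy parameter over 1500 steps
_opt = optim.SGD([nn.Parameter(torch.zeros(1))], lr=CONFIG['learning_rate'])
_sch = fetch_scheduler(_opt)
_lrs = []
for _ in range(1500):
    _opt.step()
    _sch.step()
    _lrs.append(_sch.get_last_lr()[0])
plt.figure(figsize=(10, 4))
plt.plot(_lrs)
plt.xlabel('optimizer step')
plt.ylabel('learning rate')
plt.title('CosineAnnealingLR (T_max=500, eta_min=1e-6)')
plt.show()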
In [ ]:
for fold in range(0, CONFIG['n_fold']):
    print(f"====== Fold: {fold} ======")
    run = wandb.init(project = 'Blitz_lang_class', 
                     config=CONFIG,
                     job_type='Train',
                     group=CONFIG['group'],
                     tags=['roberta-base', f'{HASH_NAME}', 'cross-entropy-loss'],
                     name=f'{HASH_NAME}-fold-{fold}',
                     anonymous='must')
    
    # Create Dataloaders
    train_loader, valid_loader = prepare_loaders(fold=fold)
    
    model = BlitzModel(CONFIG['model_name'])
    model.to(CONFIG['device'])
    
    # Define Optimizer and Scheduler
    optimizer = AdamW(model.parameters(), lr=CONFIG['learning_rate'], weight_decay=CONFIG['weight_decay'])
    scheduler = fetch_scheduler(optimizer)
    
    model, history = run_training(model, optimizer, scheduler,
                                  device=CONFIG['device'],
                                  num_epochs=CONFIG['epochs'],
                                  fold=fold)
    
    run.finish()
    
    del model, history, train_loader, valid_loader
    _ = gc.collect()
    print()
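
Each fold leaves its best checkpoint as Loss-Fold-{fold}.bin in the working directory. A minimal sketch of reloading fold 0 for inference is below (the comments point to a separate inference notebook).

In [ ]:
# Sketch: reload the best weights of fold 0 for inference
infer_model = BlitzModel(CONFIG['model_name'])
infer_model.load_state_dict(torch.load('Loss-Fold-0.bin', map_location=CONFIG['device']))
infer_model.to(CONFIG['device'])
infer_model.eval()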

Comments

jinoooooooooo
Over 2 years ago

Do check out the inference notebook too ^^
