
Obstacle Prediction

A detailed solution for submission 154537, submitted for the Obstacle Prediction challenge.

BanKhv
In [1]:
%matplotlib inline

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from tqdm import tqdm
import copy
import gc
import os
from glob import glob
import pickle
import random
import shutil
import seaborn as sns
from collections import Counter

from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold, cross_val_score, train_test_split
from lightgbm import LGBMClassifier, LGBMRegressor
from sklearn.metrics import roc_auc_score, f1_score, log_loss, accuracy_score, matthews_corrcoef
from sklearn.metrics import mean_squared_error

from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

#import category_encoders as ce
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder

from skopt.space import Real, Categorical, Integer
from skopt.utils import use_named_args
from skopt import gp_minimize
from sklearn.metrics import classification_report
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression

import warnings
warnings.simplefilter(action              = 'ignore', category = FutureWarning)
warnings.simplefilter(action              = 'ignore', category = DeprecationWarning)
warnings.simplefilter(action              = 'ignore', category = UserWarning)
warnings.simplefilter(action              = 'ignore', category = RuntimeWarning)
warnings.filterwarnings("ignore", message = "numpy.dtype size changed")
warnings.filterwarnings("ignore", message = "numpy.ufunc size changed")
pd.options.mode.chained_assignment = None
In [2]:
dtrain = pd.read_csv('../input/train.csv')
dtrain.head()
Out[2]:
id title abstract judgement
0 0 One-year age changes in MRI brain volumes in o... Longitudinal studies indicate that declines in... 0
1 1 Supportive CSF biomarker evidence to enhance t... The present study was undertaken to validate t... 0
2 2 Occurrence of basal ganglia germ cell tumors w... Objective: To report a case series in which ba... 0
3 3 New developments in diagnosis and therapy of C... The etiology and pathogenesis of idiopathic ch... 0
4 4 Prolonged shedding of SARS-CoV-2 in an elderly... NaN 0
In [3]:
dtrain.judgement.unique()
Out[3]:
array([0, 1])
In [4]:
target = 'judgement'
In [5]:
print(dtrain.shape)

dtrain = dtrain[dtrain[target] == dtrain[target]]  # drop rows where the target is NaN (NaN != NaN)

print(dtrain.shape)
(27145, 4)
(27145, 4)
In [6]:
dtrain['title'].fillna('None', inplace = True)
dtrain['abstract'].fillna('None', inplace = True)
In [7]:
dtest = pd.read_csv('../input/test.csv')
dtest.head(3)
Out[7]:
id title abstract
0 27145 Estimating the potential effects of COVID-19 p... The objective of the paper is to analyse chang...
1 27146 Leukoerythroblastic reaction in a patient with... NaN
2 27147 [15O]-water PET and intraoperative brain mappi... [15O]-water PET was performed on 12 patients w...
In [8]:
dtest['title'].fillna('None', inplace = True)
dtest['abstract'].fillna('None', inplace = True)
In [9]:
from catboost import Pool, CatBoostClassifier

model = CatBoostClassifier(
    #cat_features  = cat_features,
    text_features = ['title', 'abstract'],
    verbose = 10,
    #loss_function = 'MultiClass',
    eval_metric    = 'F1',
    task_type      = "GPU",
    iterations     = 1000,
    learning_rate  = 0.2, 
    use_best_model = True,
    #reg_lambda=0.0001,
    
    text_processing = {
        "tokenizers" : [{
            "tokenizer_id" : "Space",
            "separator_type" : "ByDelimiter",
            "delimiter" : " "
        }],

        "dictionaries" : [{
            "dictionary_id" : "BiGram",
            "token_level_type": "Letter",
            "max_dictionary_size" : "150000",
            "occurrence_lower_bound" : "1",
            "gram_order" : "2"
        },{
            "dictionary_id" : "Trigram",
            "max_dictionary_size" : "150000",
            "token_level_type": "Letter",
            "occurrence_lower_bound" : "1",
            "gram_order" : "3"
        },{
            "dictionary_id" : "Fourgram",
            "max_dictionary_size" : "150000",
            "token_level_type": "Letter",
            "occurrence_lower_bound" : "1",
            "gram_order" : "4"
        },{
            "dictionary_id" : "Fivegram",
            "max_dictionary_size" : "150000",
            "token_level_type": "Letter",
            "occurrence_lower_bound" : "1",
            "gram_order" : "5"
        },{
            "dictionary_id" : "Sixgram",
            "max_dictionary_size" : "150000",
            "token_level_type": "Letter",
            "occurrence_lower_bound" : "1",
            "gram_order" : "6"
        }
        ],

        "feature_processing" : {
            "default" : [
                    {
                    "dictionaries_names" : [
                        "BiGram", 
                        "Trigram", 
                        "Fourgram", 
                        #"Fivegram", 
                        #"Sixgram"
                    ],
                    "feature_calcers" : ["BoW"],
                    "tokenizers_names" : ["Space"]
                },
                    {
                "dictionaries_names" : [
                    "BiGram", 
                    "Trigram", 
                    "Fourgram", 
                    #"Fivegram", 
                    #"Sixgram"
                ],
                "feature_calcers" : ["NaiveBayes"],
                "tokenizers_names" : ["Space"]
            },{
                "dictionaries_names" : [
                    "BiGram", 
                    "Trigram", 
                    "Fourgram", 
                    #"Fivegram", 
                    #"Sixgram"
                ],
                "feature_calcers" : ["BM25"],
                "tokenizers_names" : ["Space"]
            },
            ],
        }
    }
)
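Note on the configuration above: the text_processing block builds letter-level character n-gram dictionaries (orders 2 through 6) over a whitespace tokenizer, but only the 2-, 3-, and 4-gram dictionaries are wired into feature_processing (the 5- and 6-gram entries are commented out); each active dictionary is turned into Bag-of-Words, Naive Bayes, and BM25 features. A minimal sketch to double-check what the estimator was constructed with (assuming the cell above has been run):

In [ ]:
# Inspect the text-related parameters passed to the constructor.
params = model.get_params()
print(params['text_features'])
print(params['text_processing']['dictionaries'])
print(params['text_processing']['feature_processing'])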
In [10]:
from sklearn.metrics import f1_score
from sklearn import model_selection

train, valid = model_selection.train_test_split(
    dtrain,
    test_size    = 0.10, 
    stratify     = dtrain[target], 
    shuffle      = True, 
    random_state = 10
)

# 10
In [ ]:
model.fit(
    train[['title', 'abstract']], train[[target]],
    eval_set = (valid[['title', 'abstract']], valid[[target]]),
    #plot=True
)
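Because use_best_model=True is set together with an eval_set, the fitted model is truncated to the iteration with the best validation F1. A minimal sketch to see where that happened (assuming the fit above has completed):

In [ ]:
# Best iteration and best validation scores recorded during training.
print('Best iteration :', model.get_best_iteration())
print('Best val scores:', model.get_best_score())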
In [ ]:
from sklearn.metrics import classification_report

print(classification_report(valid[target].values, model.predict(valid[['title', 'abstract']])))
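An optional extension, not part of the submitted solution: since the model is evaluated with F1 and model.predict() applies a fixed 0.5 cutoff, the decision threshold could be tuned on the validation split using predicted probabilities. A rough sketch under that assumption:

In [ ]:
import numpy as np
from sklearn.metrics import f1_score

# Probability of the positive class on the validation split.
valid_proba = model.predict_proba(valid[['title', 'abstract']])[:, 1]

# Scan candidate thresholds and keep the one with the best validation F1.
best_thr, best_f1 = 0.5, 0.0
for thr in np.arange(0.05, 0.95, 0.01):
    score = f1_score(valid[target].values, (valid_proba >= thr).astype(int))
    if score > best_f1:
        best_thr, best_f1 = thr, score

print(best_thr, best_f1)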
In [ ]:
preds = model.predict(dtest[['title', 'abstract']])
dtest[target] = preds

dtest[['id', target]].head()
In [ ]:
dtest[['id', target]].to_csv('02.csv', index = False, header = False)
