
Lidar Car Detection

Solution for submission 155379

A detailed solution for submission 155379 submitted for challenge Lidar Car Detection

BanKhv
In [1]:
%matplotlib inline

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from tqdm import tqdm
import copy
import gc
import os
from glob import glob
import pickle
import random
import shutil
import seaborn as sns
from collections import Counter

from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold, cross_val_score, train_test_split
from lightgbm import LGBMClassifier, LGBMRegressor
from sklearn.metrics import roc_auc_score, f1_score, log_loss, accuracy_score, matthews_corrcoef
from sklearn.metrics import mean_squared_error

from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

#import category_encoders as ce
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder

from skopt.space import Real, Categorical, Integer
from skopt.utils import use_named_args
from skopt import gp_minimize
from sklearn.metrics import classification_report
from sklearn.linear_model import LinearRegression, LogisticRegression

import warnings
warnings.simplefilter(action              = 'ignore', category = FutureWarning)
warnings.simplefilter(action              = 'ignore', category = DeprecationWarning)
warnings.simplefilter(action              = 'ignore', category = UserWarning)
warnings.simplefilter(action              = 'ignore', category = RuntimeWarning)
warnings.filterwarnings("ignore", message = "numpy.dtype size changed")
warnings.filterwarnings("ignore", message = "numpy.ufunc size changed")
pd.options.mode.chained_assignment = None
%load_ext aicrowd.magic
%aicrowd login
!rm -rf data
!mkdir data
%aicrowd ds dl -c lidar-car-detection -o data
In [ ]:

In [2]:
train_data = np.load("./data/train.npz", allow_pickle=True)
train_data = train_data['train']

train_data.shape
Out[2]:
(400, 2)
In [3]:
# Only the first 100 of the 400 training scenes are used: column 0 holds the
# raw point clouds, column 1 the car-count labels.
X = train_data[:100, 0]
dtrain = [i.flatten() for i in X]
target = train_data[:100, 1]
In [4]:
test_data = np.load("./data/test.npz", allow_pickle=True)
test_data = test_data['test']

test_data.shape
Out[4]:
(601,)
In [5]:
dtest = [i.flatten() for i in test_data]
In [6]:
dtrain = np.array(dtrain)
dtest = np.array(dtest)
In [7]:
use = ['x_' + str(f) for f in range(dtrain.shape[1])]

dtrain = pd.DataFrame(dtrain, columns = use)
dtrain['target'] = target

dtest = pd.DataFrame(dtest, columns = use)
In [8]:
dtrain.shape, dtest.shape
Out[8]:
((100, 116380), (601, 116379))
In [9]:
dtrain.head()
Out[9]:
x_0 x_1 x_2 x_3 x_4 x_5 x_6 x_7 x_8 x_9 ... x_116370 x_116371 x_116372 x_116373 x_116374 x_116375 x_116376 x_116377 x_116378 target
0 -12.568300 -5.545469 7.304270 -2.542295 16.209188 8.723945 7.204814 -5.573594 4.843357 15.317773 ... -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 1
1 1.809375 -5.568750 3.113327 1.661836 -5.567539 3.089362 1.516445 -5.566328 3.067530 1.373027 ... -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 0
2 -14.791308 -10.746484 9.721252 -15.049731 -10.389766 9.723750 -15.299448 -10.026738 9.726181 -15.540278 ... -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 1
3 10.573271 -12.161767 8.568655 7.393847 -8.930312 6.164613 6.548662 -12.020322 7.278258 -5.766494 ... -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 4
4 8.062422 14.520190 8.830832 -1.427363 -5.454453 2.997852 -1.573184 -5.470215 3.026464 -3.972129 ... -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 1

5 rows × 116380 columns
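Each flattened row can be sanity-checked visually. Below is a minimal sketch (not part of the original run) that assumes each row encodes 38,793 (x, y, z) lidar returns, since 116379 = 38793 * 3, with the constant -1.0 values acting as padding:

# Hypothetical visual check: reshape one scene back into points and plot it.
pts = dtrain.iloc[0].drop('target').values.astype(float).reshape(-1, 3)
pts = pts[(pts != -1.0).any(axis = 1)]          # drop the -1.0 padding rows
plt.scatter(pts[:, 0], pts[:, 1], s = 0.5)
plt.title('scene 0: %d returns, target = %d' % (len(pts), dtrain['target'].iloc[0]))
plt.show()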

In [10]:
# remove constant columns
remove = []
for c in dtrain.columns:
    if dtrain[c].std() == 0:
        remove.append(c)

print('remove constant columns', remove)
dtrain.drop(remove, axis = 1, inplace = True)
dtest.drop(remove,  axis = 1, inplace = True)

use = list(dtest.columns)
remove constant columns ['x_115641', 'x_115642', 'x_115643', ..., 'x_116377', 'x_116378']
(738 contiguous columns: the shared -1.0 padding tail)
In [11]:
print(dtrain.shape, dtest.shape)
(100, 115642) (601, 115641)
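The column-by-column loop above is equivalent to a single vectorized pass; a sketch that yields the same remove list:

# Vectorized constant-column detection (same result as the loop above):
stds = dtrain.std()
remove = stds[stds == 0].index.tolist()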
In [12]:
# imp_df-0.csv / imp_df-1.csv are permutation-importance tables saved by earlier
# runs of the pim() routine defined below (presumably renamed from imp_df.csv).
temp = pd.read_csv('imp_df-0.csv')

remove = temp['feat'][temp['diff'] >= 0]
#remove
for u in remove:
    if u in use:
        use.remove(u)
        
len(use)
Out[12]:
112825
In [13]:
temp = pd.read_csv('imp_df-1.csv')

remove = temp['feat'][temp['diff'] >= 0]
#remove
for u in remove:
    if u in use:
        use.remove(u)
        
len(use)
Out[13]:
101605
temp = pd.read_csv('imp_df-2.csv')
remove = temp['feat'][temp['diff'] >= 0]
#remove
for u in remove:
    if u in use:
        use.remove(u)
len(use)

temp = pd.read_csv('imp_df-0.csv')
use = list(temp['feat'][temp['diff'] < 0])
len(use)

temp1 = pd.read_csv('imp_df-1.csv')
use1 = list(temp1['feat'][temp1['diff'] < 0])
for u in use1:
    if u not in use:
        use.append(u)
len(use)
In [14]:
target = dtrain['target']

# Compress the surviving ~100k raw coordinates into 45 dense components.
svd = TruncatedSVD(n_components = 45, n_iter = 15, random_state = 42)
svd.fit(dtrain[use].values)
print(svd.explained_variance_ratio_.sum())   # fraction of variance retained

xtrain = svd.transform(dtrain[use].values)
xtest  = svd.transform(dtest[use].values)

dtrain = pd.DataFrame(xtrain)
dtest  = pd.DataFrame(xtest)

xuse = ['x_' + str(f) for f in range(dtrain.shape[1])]

dtrain.columns = xuse
dtrain['target'] = target

dtest.columns = xuse
use = xuse

dtrain.head()
0.9100863799262379
Out[14]:
x_0 x_1 x_2 x_3 x_4 x_5 x_6 x_7 x_8 x_9 ... x_36 x_37 x_38 x_39 x_40 x_41 x_42 x_43 x_44 target
0 665.347745 61.008411 7.478397 234.969242 1.860219 -36.285563 257.085407 -59.178237 50.854748 -124.160004 ... 43.604942 19.777652 27.203507 29.166768 53.762498 -0.194678 16.821663 -5.826462 16.599742 1
1 -164.259115 -660.473972 -329.311084 -154.422256 180.228304 -95.903074 -450.383136 483.737298 276.568393 614.044662 ... -36.370988 69.377785 26.731596 -26.265083 57.937470 52.809484 30.844033 7.480102 21.639360 0
2 -551.119638 -19.721721 279.854378 1058.988323 -68.721455 408.864109 -440.757909 423.665012 -200.041928 -676.242236 ... -221.107925 -199.454364 -55.708469 -288.397391 8.343854 132.603067 187.819706 22.807170 66.607220 1
3 487.408905 -358.594540 -34.508921 267.359841 76.387211 -208.288967 148.615855 -29.398435 84.436366 -154.206600 ... -5.731330 32.457181 58.627389 11.941469 -39.929008 37.576730 8.108722 -64.447498 41.338146 4
4 -907.094917 -65.134401 -137.229341 -187.632295 554.482389 -117.433242 -195.900825 -51.667340 -29.178691 54.870819 ... -60.075129 -9.423686 -21.739545 -32.864875 -79.617197 5.581825 -77.436023 -26.228361 -2.584403 1

5 rows × 46 columns
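The choice of 45 components is not derived in the notebook. One way to pick it (a sketch; the 100-component ceiling is an arbitrary assumption, and `features` is a hypothetical stand-in for the pre-SVD matrix dtrain[use].values from the cell above) is to probe the cumulative explained variance:

# Hypothetical probe for n_components on the pre-SVD feature matrix.
probe = TruncatedSVD(n_components = 100, n_iter = 15, random_state = 42)
probe.fit(features)
cum = np.cumsum(probe.explained_variance_ratio_)
print(int(np.argmax(cum >= 0.90)) + 1)   # smallest k retaining >= 90% variance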

In [ ]:

In [15]:
################### permutation importance

from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold, cross_val_score
from lightgbm import LGBMClassifier, LGBMRegressor
from sklearn.metrics import roc_auc_score, f1_score, log_loss, accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.cluster import KMeans
from sklearn.model_selection import cross_val_score

target = 'target'
dtrain.reset_index(drop = True, inplace = True)

def pim(xtrain, xuse, target):
    # Permutation importance: fit seeds * n_splits CV models once, then measure
    # how shuffling each feature within the validation folds changes the MSE.
    # A feature whose permuted score is no worse than the baseline adds nothing
    # and is flagged 'need delete'.
    models = []
    Loss = []
    seeds    = 3
    n_splits = 5
    imp_df = pd.DataFrame(columns = ['feat', 'imp', 'imp_permuted'])

    for seed in range(seeds):
        skf = StratifiedKFold(n_splits = n_splits, shuffle = True, random_state = seed)
        for train_index, test_index in skf.split(xtrain, xtrain[target]):            
            X_train, X_val = xtrain[use].loc[train_index],    xtrain[use].loc[test_index]
            y_train, y_val = xtrain[target].loc[train_index], xtrain[target].loc[test_index]
            
            model = LGBMRegressor(
                max_depth    = 3 + seed * 2, 
                random_state = seed, 
                n_estimators = 1000,
                #device       = 'gpu',
                subsample        = 0.98, 
                subsample_freq   = 5, 
                colsample_bytree = 0.98,
                reg_alpha        = 0.01, 
                reg_lambda       = 0.1
            )
            model.fit(
                X_train, 
                y_train.values,
                eval_set = (X_val, y_val), 
                early_stopping_rounds = 100,
                verbose               = False,
            )
            temp = model.predict(X_val)
            sc = mean_squared_error(y_val, temp)
            Loss.append(sc)
            models.append(model)
            
    sc = np.mean(Loss)
                    
    for u in xuse:
        N = 0
        Loss_permuted = []
        for seed in range(seeds):
            skf = StratifiedKFold(n_splits = n_splits, shuffle = True, random_state = seed)
            for train_index, test_index in skf.split(xtrain, xtrain[target]):           
                X_train, X_val = xtrain[use].loc[train_index],    xtrain[use].loc[test_index]
                y_train, y_val = xtrain[target].loc[train_index], xtrain[target].loc[test_index]
                
                X_val_permuted = X_val.copy()
                temp = X_val_permuted[u].values
                np.random.shuffle(temp)
                X_val_permuted[u] = temp
                
                temp = models[N].predict(X_val_permuted.values)
                sc_permuted = mean_squared_error(y_val, temp)
                Loss_permuted.append(sc_permuted) 
                N += 1
                    
        sc_permuted = np.mean(Loss_permuted)
    
        if sc_permuted <= sc:
            print(u, sc, sc_permuted, 'need delete')
        else:
            print(u, sc, sc_permuted)
        
        if u in imp_df['feat'].unique():
            imp_df['imp'][imp_df['feat'] == u] += sc
            imp_df['imp_permuted'][imp_df['feat'] == u] += sc_permuted
        else:
            L = imp_df.shape[0]
            imp_df.loc[L, 'feat'] = u
            imp_df['imp'][imp_df['feat'] == u] = sc
            imp_df['imp_permuted'][imp_df['feat'] == u] = sc_permuted
        
        imp_df['diff'] = imp_df['imp'] - imp_df['imp_permuted']
        imp_df.sort_values(by = ['diff'], inplace = True)
        imp_df.to_csv('imp_df.csv', index = False)
        
    imp_df[['imp', 'imp_permuted']] = imp_df[['imp', 'imp_permuted']] / (seeds * n_splits)

    ################### permutation importance

    imp_df['diff'] = imp_df['imp'] - imp_df['imp_permuted']
    imp_df.sort_values(by = ['diff'], inplace = True)

    remove = imp_df['feat'][imp_df['diff'] >= 0].values
    print(remove)

    for u in remove:
        if u in use:
            use.remove(u)   # prunes the shared feature list in place

    return xuse
        
target = 'target'
dtrain.reset_index(drop = True, inplace = True)
dtrain[target] = dtrain[target].astype(int)
use = pim(dtrain, use, target)
x_0 1.9339867187011441 1.9260438673833855 need delete
x_1 1.9339867187011441 1.9339867187011441 need delete
x_2 1.9339867187011441 1.9317100646282124 need delete
x_3 1.9339867187011441 1.9396635537440337
x_4 1.9339867187011441 1.9442546489626882
x_5 1.9339867187011441 1.9155553703430135 need delete
x_6 1.9339867187011441 1.9303053218223094 need delete
x_7 1.9339867187011441 1.9290217120798767 need delete
x_8 1.9339867187011441 1.9368604828823555
x_9 1.9339867187011441 2.256370267370543
x_10 1.9339867187011441 1.939700345653777
x_11 1.9339867187011441 1.92566597249631 need delete
x_12 1.9339867187011441 1.9634984084876483
x_13 1.9339867187011441 1.9401763883879362
x_14 1.9339867187011441 1.9437716574734112
x_15 1.9339867187011441 1.9218898480174806 need delete
x_16 1.9339867187011441 1.933798010490061 need delete
x_17 1.9339867187011441 1.933623092454455 need delete
x_18 1.9339867187011441 1.9268854566917824 need delete
x_19 1.9339867187011441 1.9486716400490316
x_20 1.9339867187011441 1.963375701430915
x_21 1.9339867187011441 1.9325249299327656 need delete
x_22 1.9339867187011441 1.9338757221577756 need delete
x_23 1.9339867187011441 1.94023757466203
x_24 1.9339867187011441 1.9547714725021694
x_25 1.9339867187011441 1.9331332313067928 need delete
x_26 1.9339867187011441 1.9287832652432735 need delete
x_27 1.9339867187011441 1.9208524309675672 need delete
x_28 1.9339867187011441 2.2468604185606873
x_29 1.9339867187011441 1.9339776436431544 need delete
x_30 1.9339867187011441 1.9245475823117417 need delete
x_31 1.9339867187011441 1.9525390872515265
x_32 1.9339867187011441 1.934115901424877
x_33 1.9339867187011441 1.9339867187011441 need delete
x_34 1.9339867187011441 1.933482916471365 need delete
x_35 1.9339867187011441 1.9350089098115868
x_36 1.9339867187011441 1.9648474599921144
x_37 1.9339867187011441 1.9339867187011441 need delete
x_38 1.9339867187011441 1.9323254700479073 need delete
x_39 1.9339867187011441 1.9617270531796416
x_40 1.9339867187011441 1.9559755997365633
x_41 1.9339867187011441 1.9277023996160783 need delete
x_42 1.9339867187011441 1.9324857224787233 need delete
x_43 1.9339867187011441 1.9347718749405975
x_44 1.9339867187011441 1.9339867187011441 need delete
['x_1' 'x_44' 'x_37' 'x_33' 'x_29' 'x_22' 'x_16' 'x_17' 'x_34' 'x_25'
 'x_21' 'x_42' 'x_38' 'x_2' 'x_6' 'x_7' 'x_26' 'x_41' 'x_18' 'x_0' 'x_11'
 'x_30' 'x_15' 'x_27' 'x_5']
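scikit-learn ships the same idea as a utility; a minimal sketch with sklearn.inspection.permutation_importance on a single split (the pim() routine above instead averages over 3 seeds x 5 folds):

from sklearn.inspection import permutation_importance

# Hypothetical single-split variant of the permutation test above.
X_tr, X_val, y_tr, y_val = train_test_split(
    dtrain[use], dtrain['target'], test_size = 0.2, random_state = 0)
m = LGBMRegressor(n_estimators = 200, random_state = 0).fit(X_tr, y_tr)
r = permutation_importance(m, X_val, y_val, n_repeats = 10, random_state = 0,
                           scoring = 'neg_mean_squared_error')
print([f for f, imp in zip(use, r.importances_mean) if imp <= 0])   # removal candidates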
In [16]:
use
Out[16]:
['x_3',
 'x_4',
 'x_8',
 'x_9',
 'x_10',
 'x_12',
 'x_13',
 'x_14',
 'x_19',
 'x_20',
 'x_23',
 'x_24',
 'x_28',
 'x_31',
 'x_32',
 'x_35',
 'x_36',
 'x_39',
 'x_40',
 'x_43']
In [ ]:

In [17]:
target = dtrain['target'].values

dtrain = dtrain[use]
dtest  = dtest[use]
dtrain['target'] = target
dtrain['target'] = dtrain['target'].astype(int)
gc.collect()
Out[17]:
182
In [18]:
dtrain.head()
Out[18]:
x_3 x_4 x_8 x_9 x_10 x_12 x_13 x_14 x_19 x_20 ... x_24 x_28 x_31 x_32 x_35 x_36 x_39 x_40 x_43 target
0 234.969242 1.860219 50.854748 -124.160004 2.820957 -72.305011 -0.787518 -179.410244 64.180675 34.828274 ... 16.707547 61.593103 -35.363700 26.145273 -14.322306 43.604942 29.166768 53.762498 -5.826462 1
1 -154.422256 180.228304 276.568393 614.044662 -402.210747 210.637253 457.552202 -295.063815 -640.691430 187.355371 ... 33.748974 -174.090849 -56.299179 603.621445 -8.414453 -36.370988 -26.265083 57.937470 7.480102 0
2 1058.988323 -68.721455 -200.041928 -676.242236 -195.270532 972.021663 -107.118557 -17.789619 266.252472 -197.238109 ... 385.307033 -155.672637 186.114755 162.335423 50.124042 -221.107925 -288.397391 8.343854 22.807170 1
3 267.359841 76.387211 84.436366 -154.206600 -44.934611 108.287200 71.411365 30.869205 -15.257457 57.558212 ... -12.583325 63.539872 -39.333358 -27.198102 18.405949 -5.731330 11.941469 -39.929008 -64.447498 4
4 -187.632295 554.482389 -29.178691 54.870819 -39.739966 -19.941638 22.359923 52.685056 -114.473131 -3.741718 ... -3.486473 61.688607 28.449911 -78.514572 44.842399 -60.075129 -32.864875 -79.617197 -26.228361 1

5 rows × 21 columns

In [19]:
# Covariate shift adaptation (adversarial validation): train a classifier to
# distinguish train rows from test rows, then weight each training sample by
# how test-like it looks.

if 'is_train' in dtrain.columns:
    del dtrain['is_train']    
    
if 'is_train' in dtest.columns:
    del dtest['is_train']   
    
if 'is_train' in use:
    use.remove('is_train')

len_X = dtrain.shape[0]
dtrain['is_train'] = 1
dtest['is_train']  = 0
use.append('is_train')

df     = pd.concat([dtrain[use], dtest[use]], axis = 0, ignore_index = True, sort = False)
target = df['is_train']
del dtrain['is_train'], dtest['is_train']
df['pred'] = 0
AUC = []
use.remove('is_train')


seeds    = 3
n_splits = 5

for seed in range(seeds):
    skf = StratifiedKFold(n_splits = n_splits, shuffle = True, random_state = seed)
    
    model = LGBMClassifier(
        max_depth = 3 + seed * 2, 
        random_state = seed, 
        n_estimators = 1000,
        #device       = 'gpu',
        subsample        = 0.98, 
        subsample_freq   = 5, 
        colsample_bytree = 0.98,
        reg_alpha        = 0.01, 
        reg_lambda       = 0.1
    )
    
    for train_index, test_index in skf.split(df, df['is_train']):
        X_train, X_test = df.loc[train_index],             df.loc[test_index]
        y_train, y_test = df['is_train'].loc[train_index], df['is_train'].loc[test_index]
        
        model.fit(
            X_train[use],
            y_train,
            eval_set = (X_test[use], y_test),
            verbose = False,
            early_stopping_rounds = 100,
        )
        
        temp = model.predict_proba(X_test[use])[:, 0]   # P(is_train == 0): how test-like the row is
        df['pred'].loc[test_index] += temp
    
        temp = model.predict_proba(X_test[use])[:, 1]
        sc = roc_auc_score(y_test, temp)
        print('roc_auc_score', sc)
        AUC.append(sc)
    
weights = df['pred'][:len_X]    # accumulated test-likeness of each train row
weights /= np.mean(weights)     # normalize to mean 1

print(weights.min(), weights.max())
dtrain['weights'] = weights
roc_auc_score 0.7595041322314049
roc_auc_score 0.6829166666666666
roc_auc_score 0.7204166666666666
roc_auc_score 0.6254166666666667
roc_auc_score 0.6641666666666666
roc_auc_score 0.7200413223140497
roc_auc_score 0.6610416666666665
roc_auc_score 0.7370833333333333
roc_auc_score 0.63875
roc_auc_score 0.7112499999999999
roc_auc_score 0.6797520661157024
roc_auc_score 0.7570833333333333
roc_auc_score 0.6847916666666667
roc_auc_score 0.6675
roc_auc_score 0.6100000000000001
0.6240381314826711 1.1982149354909963
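The weight used above is the raw accumulated P(is_train == 0 | x). A common refinement (a sketch of an alternative, not what was submitted) is the odds form of the density ratio, w(x) = p_test(x) / p_train(x) ≈ P(test | x) / P(train | x):

# Hypothetical odds-ratio variant of the weights computed above. Each train
# row is scored once per seed, so dividing by seeds gives the mean probability.
p_test = df['pred'][:len_X] / seeds
w = p_test / (1.0 - p_test)          # ≈ p_test(x) / p_train(x)
w /= w.mean()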
In [20]:
def get_params(train, target, use, n_splits, n_calls = 5, verbose = False):
    # Bayesian optimization (skopt gp_minimize) over a 21-dimensional space that
    # jointly tunes three LGBMRegressor configurations; the CV score of their
    # averaged prediction is the objective.
    
    def score(params_temp):
        
        Loss     = []
        n_estimators = N_estimators
                
        params0 = {
            'subsample':         params_temp[0],
            'colsample_bytree':  params_temp[1],
            'reg_alpha':         params_temp[2],
            'reg_lambda':        params_temp[3],
            'learning_rate':     params_temp[4],
            'num_leaves':        params_temp[5],
            'max_depth':         params_temp[6],
            'n_estimators':      n_estimators,
            #'device':            'gpu',
        }
        params1 = {
            'subsample':         params_temp[7],
            'colsample_bytree':  params_temp[8],
            'reg_alpha':         params_temp[9],
            'reg_lambda':        params_temp[10],
            'learning_rate':     params_temp[11],
            'num_leaves':        params_temp[12],
            'max_depth':         params_temp[13],
            'n_estimators':      n_estimators,
            #'device':            'gpu',
        } 
        params2 = {
            'subsample':         params_temp[14],
            'colsample_bytree':  params_temp[15],
            'reg_alpha':         params_temp[16],
            'reg_lambda':        params_temp[17],
            'learning_rate':     params_temp[18],
            'num_leaves':        params_temp[19],
            'max_depth':         params_temp[20],
            'n_estimators':      n_estimators,
            #'device':            'gpu',
        }
        
        
        for seed in range(seeds):            
            params0['random_state'] = seed            
            clf0 = LGBMRegressor(**params0)
            
            params1['random_state'] = seed
            clf1  = LGBMRegressor(**params1)
            
            params2['random_state'] = seed
            clf2  = LGBMRegressor(**params2)
            
            esr = 100   # early-stopping patience
            
            skf = StratifiedKFold(n_splits = n_splits, shuffle = True, random_state = seed)
            for train_index, test_index in skf.split(train, train[target]):
                X_trg, X_val = train[use].loc[train_index],    train[use].loc[test_index]
                y_trg, y_val = train[target].loc[train_index], train[target].loc[test_index]
            
                clf0.fit(
                    X_trg, 
                    y_trg, 
                    verbose = False, 
                    eval_set = (X_val, y_val),
                    early_stopping_rounds = esr,
                    sample_weight = dtrain['weights'].loc[train_index]
                )                      
                temp0 = clf0.predict(X_val)
                    
                clf1.fit(
                    X_trg, 
                    y_trg, 
                    verbose = False, 
                    eval_set = (X_val, y_val),
                    early_stopping_rounds = esr,
                    sample_weight = dtrain['weights'].loc[train_index]
                )                      
                temp1 = clf1.predict(X_val)
            
                clf2.fit(
                    X_trg, 
                    y_trg, 
                    verbose = False, 
                    eval_set = (X_val, y_val),
                    early_stopping_rounds = esr, 
                    sample_weight = dtrain['weights'].loc[train_index]
                )                      
                temp2 = clf2.predict(X_val)                      
                    
                temp = (temp0 + temp1 + temp2) / 3              

                sc = mean_squared_error(y_val, temp)
                Loss.append(sc)

        
        Loss = np.mean(Loss)
        
        L = df_res.shape[0] + 1
        df_res.loc[L, 'rmse'] = Loss   # note: this is the raw MSE, despite the 'rmse' column name
        best = df_res['rmse'].min()
        
        print('rmse...', Loss, 'Best...', best, 'Iter ', L)
        #if L > 1:
        #    plt.plot(range(df_res.shape[0]), df_res['rmse'], 'r')
        #    plt.show()
        
        #print(params0)
        return Loss
    
    df_res = pd.DataFrame()
    values  = [
        Real(0.90,   1,    "log-uniform",   name = 'subsample'),
        Real(0.90,   1,    "log-uniform",   name = 'colsample_bytree'),
        Real(1e-14,  0.2,  "log-uniform",   name = 'reg_alpha'),
        Real(1e-14,  0.2,  "log-uniform",   name = 'reg_lambda'),
        Real(0.03,   0.3,  "log-uniform",   name = 'learning_rate'),
        Integer(5,   50,                    name = 'num_leaves'),
        Integer(3,   50,                    name = 'max_depth'),
        
        Real(0.90,   1,    "log-uniform",   name = 'subsample'),
        Real(0.90,   1,    "log-uniform",   name = 'colsample_bytree'),
        Real(1e-14,  0.2,  "log-uniform",   name = 'reg_alpha'),
        Real(1e-14,  0.2,  "log-uniform",   name = 'reg_lambda'),
        Real(0.03,   0.3,  "log-uniform",   name = 'learning_rate'),
        Integer(5,   75,                    name = 'num_leaves'),
        Integer(3,   75,                    name = 'max_depth'),
        
        Real(0.90,   1,    "log-uniform",   name = 'subsample'),
        Real(0.90,   1,    "log-uniform",   name = 'colsample_bytree'),
        Real(1e-14,  0.2,  "log-uniform",   name = 'reg_alpha'),
        Real(1e-14,  0.2,  "log-uniform",   name = 'reg_lambda'),
        Real(0.03,   0.3,  "log-uniform",   name = 'learning_rate'),
        Integer(5,   75,                    name = 'num_leaves'),
        Integer(3,   75,                    name = 'max_depth'),

    ]
    res_gp = gp_minimize(score, values, n_calls = n_calls, random_state = 142, n_random_starts = 3)
    n_estimators = N_estimators
    params0 = {
        'subsample':         res_gp.x[0],
        'colsample_bytree':  res_gp.x[1],
        'reg_alpha':         res_gp.x[2],
        'reg_lambda':        res_gp.x[3],
        'learning_rate':     res_gp.x[4],
        'num_leaves':        res_gp.x[5],
        'max_depth':         res_gp.x[6],
        'n_estimators':      n_estimators,
        #'device':            'gpu',
    }
    
    params1 = {
        'subsample':         res_gp.x[7],
        'colsample_bytree':  res_gp.x[8],
        'reg_alpha':         res_gp.x[9],
        'reg_lambda':        res_gp.x[10],
        'learning_rate':     res_gp.x[11],
        'num_leaves':        res_gp.x[12],
        'max_depth':         res_gp.x[13],
        'n_estimators':      n_estimators,
        #'device':            'gpu',
    }
    
    params2 = {
        'subsample':         res_gp.x[14],
        'colsample_bytree':  res_gp.x[15],
        'reg_alpha':         res_gp.x[16],
        'reg_lambda':        res_gp.x[17],
        'learning_rate':     res_gp.x[18],
        'num_leaves':        res_gp.x[19],
        'max_depth':         res_gp.x[20],
        'n_estimators':      n_estimators,
        #'device':            'gpu',
    }

    print('\n', 'Best score', res_gp.fun, '\n')
    print(params0, '\n', params1, '\n', params2)
    return params0, params1, params2
In [21]:
seeds    = 5
n_splits = 7
N_estimators = 1000
target = 'target'

params0, params1, params2 = get_params(dtrain, target, use, n_splits, 10)
rmse... 1.7005283702618392 Best... 1.7005283702618392 Iter  1
rmse... 1.6899261134828916 Best... 1.6899261134828916 Iter  2
rmse... 1.6924281059999033 Best... 1.6899261134828916 Iter  3
rmse... 1.6822906260154833 Best... 1.6822906260154833 Iter  4
rmse... 1.691543892997744 Best... 1.6822906260154833 Iter  5
rmse... 1.6666513713365207 Best... 1.6666513713365207 Iter  6
rmse... 1.6539211114189913 Best... 1.6539211114189913 Iter  7
rmse... 1.6349251318291007 Best... 1.6349251318291007 Iter  8
rmse... 1.6889502238064658 Best... 1.6349251318291007 Iter  9
rmse... 1.6814596650146065 Best... 1.6349251318291007 Iter  10

 Best score 1.6349251318291007 

{'subsample': 0.914233740611371, 'colsample_bytree': 0.9339041674510885, 'reg_alpha': 0.000359823678231207, 'reg_lambda': 0.2, 'learning_rate': 0.07976796062066106, 'num_leaves': 38, 'max_depth': 16, 'n_estimators': 1000} 
 {'subsample': 0.9223510269284443, 'colsample_bytree': 0.9, 'reg_alpha': 0.0074024590481468235, 'reg_lambda': 3.116421681339948e-11, 'learning_rate': 0.29999999999999993, 'num_leaves': 73, 'max_depth': 3, 'n_estimators': 1000} 
 {'subsample': 0.932767757138783, 'colsample_bytree': 0.9026487694066411, 'reg_alpha': 0.0002815354465164848, 'reg_lambda': 1.0831441915802143e-05, 'learning_rate': 0.2069421811919979, 'num_leaves': 13, 'max_depth': 58, 'n_estimators': 1000}
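The search space above is indexed positionally (params_temp[0] ... params_temp[20]), so the names on the Real/Integer dimensions go unused. The use_named_args decorator imported at the top offers a cleaner pattern; a sketch with a single-model space for brevity (this space and objective are illustrative assumptions, not the tuning that was run):

space = [
    Real(0.03, 0.3, "log-uniform", name = 'learning_rate'),
    Integer(5, 50, name = 'num_leaves'),
]

@use_named_args(space)
def objective(learning_rate, num_leaves):
    model = LGBMRegressor(learning_rate = learning_rate, num_leaves = num_leaves,
                          n_estimators = 100, random_state = 0)
    # cross_val_score returns negated MSE; negate again so gp_minimize minimizes MSE
    return -cross_val_score(model, dtrain[use], dtrain['target'],
                            cv = 3, scoring = 'neg_mean_squared_error').mean()

#res = gp_minimize(objective, space, n_calls = 10, random_state = 142)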
In [ ]:

In [22]:
use
Out[22]:
['x_3',
 'x_4',
 'x_8',
 'x_9',
 'x_10',
 'x_12',
 'x_13',
 'x_14',
 'x_19',
 'x_20',
 'x_23',
 'x_24',
 'x_28',
 'x_31',
 'x_32',
 'x_35',
 'x_36',
 'x_39',
 'x_40',
 'x_43']
In [ ]:

In [23]:
Loss     = []
dtest[target] = 0   # test predictions are accumulated over all seeds and folds

if True:
        
        for seed in range(seeds):            
            params0['random_state'] = seed            
            clf0 = LGBMRegressor(**params0)
            
            params1['random_state'] = seed
            clf1  = LGBMRegressor(**params1)
            
            params2['random_state'] = seed
            clf2  = LGBMRegressor(**params2)
            
            esr = 100
            I = 0
            
            skf = StratifiedKFold(n_splits = n_splits, shuffle = True, random_state = seed)
            for train_index, test_index in skf.split(dtrain, dtrain[target]):
                X_trg, X_val = dtrain[use].loc[train_index],    dtrain[use].loc[test_index]
                y_trg, y_val = dtrain[target].loc[train_index], dtrain[target].loc[test_index]
            
                clf0.fit(
                    X_trg, 
                    y_trg, 
                    verbose = False, 
                    eval_set = (X_val, y_val),
                    early_stopping_rounds = esr,
                    sample_weight = dtrain['weights'].loc[train_index]
                )                      
                temp0 = clf0.predict(X_val)
                pred0 = clf0.predict(dtest[use])
                    
                clf1.fit(
                    X_trg, 
                    y_trg, 
                    verbose = False, 
                    eval_set = (X_val, y_val),
                    early_stopping_rounds = esr, 
                    sample_weight = dtrain['weights'].loc[train_index]
                )                      
                temp1 = clf1.predict(X_val)
                pred1 = clf1.predict(dtest[use])
            
                clf2.fit(
                    X_trg, 
                    y_trg, 
                    verbose = False, 
                    eval_set = (X_val, y_val),
                    early_stopping_rounds = esr,
                    sample_weight = dtrain['weights'].loc[train_index]
                )                      
                temp2 = clf2.predict(X_val)
                pred2 = clf2.predict(dtest[use])
                
                pred = (pred0 + pred1 + pred2) / 3
                dtest[target] += pred                
                    
                temp = (temp0 + temp1 + temp2) / 3              

                sc = mean_squared_error(y_val, temp)
                Loss.append(sc)
                
            print(np.mean(Loss))
        
print(np.mean(Loss))
                
dtest[target] = dtest[target] / (seeds * n_splits)
1.4406613111818205
1.5970538296865253
1.58150957740641
1.60402251621049
1.6349251318291007
1.6349251318291007
In [ ]:

In [24]:
################### RFC: greedy backward feature elimination
# The first iteration (I == 0) only establishes the baseline CV score; every
# later iteration drops one candidate feature (plus everything already removed)
# and keeps the drop whenever the score improves. The first feature in the
# shuffled order is consumed by the baseline and is never tested itself.
np.random.shuffle(use)

I = 0
remove   = []

for u in use:
    Loss = []
    dtest[target] = 0
    for seed in range(seeds):        
        if I == 0:
            usen = use.copy()
        else:
            usen = use.copy()
            usen.remove(u)
            for f in remove:
                if f in usen:
                    usen.remove(f)            
        
        params0['random_state'] = seed            
        clf0 = LGBMRegressor(**params0)
            
        params1['random_state'] = seed
        clf1  = LGBMRegressor(**params1)
            
        params2['random_state'] = seed
        clf2  = LGBMRegressor(**params2)
            
        esr = 100
            
        skf = StratifiedKFold(n_splits = n_splits, shuffle = True, random_state = seed)
        for train_index, test_index in skf.split(dtrain, dtrain[target]):
            X_trg, X_val = dtrain[usen].loc[train_index],    dtrain[usen].loc[test_index]
            y_trg, y_val = dtrain[target].loc[train_index], dtrain[target].loc[test_index]
            
            clf0.fit(
                X_trg, 
                y_trg, 
                verbose = False, 
                eval_set = (X_val, y_val),
                early_stopping_rounds = esr,
                sample_weight = dtrain['weights'].loc[train_index]
            )                      
            temp0 = clf0.predict(X_val)
            pred0 = clf0.predict(dtest[usen])
                    
            clf1.fit(
                X_trg, 
                y_trg, 
                verbose = False, 
                eval_set = (X_val, y_val),
                early_stopping_rounds = esr, 
                sample_weight = dtrain['weights'].loc[train_index]
            )                      
            temp1 = clf1.predict(X_val)
            pred1 = clf1.predict(dtest[usen])
            
            clf2.fit(
                X_trg, 
                y_trg, 
                verbose = False, 
                eval_set = (X_val, y_val),
                early_stopping_rounds = esr,
                sample_weight = dtrain['weights'].loc[train_index]
            )                      
            temp2 = clf2.predict(X_val)
            pred2 = clf2.predict(dtest[usen])
                
            pred = (pred0 + pred1 + pred2) / 3
            dtest[target] += pred                
                    
            temp = (temp0 + temp1 + temp2) / 3              

            sc = mean_squared_error(y_val, temp)
            Loss.append(sc)
            
    if I == 0:
        best = np.mean(Loss)
        print('start score', best)
    else:
        if best > np.mean(Loss):
            print(u, best, np.mean(Loss))
            best = np.mean(Loss)
            remove.append(u)
            print(remove)
        else:
            print(u, '		', best, np.mean(Loss))  
    
    del Loss                
    I += 1

################### RFC

print(remove)

for u in remove:
    if u in use:
        use.remove(u)
start score 1.6569092456546846
x_39 		 1.6569092456546846 1.7029584082759301
x_10 		 1.6569092456546846 1.68324277503502
x_12 1.6569092456546846 1.6041848960678078
['x_12']
x_31 1.6041848960678078 1.6035895957270514
['x_12', 'x_31']
x_35 1.6035895957270514 1.5926631996245522
['x_12', 'x_31', 'x_35']
x_23 1.5926631996245522 1.5803677022372167
['x_12', 'x_31', 'x_35', 'x_23']
x_24 1.5803677022372167 1.5479813241076823
['x_12', 'x_31', 'x_35', 'x_23', 'x_24']
x_3 		 1.5479813241076823 1.5644736770748013
x_14 		 1.5479813241076823 1.5564052434984226
x_8 1.5479813241076823 1.5297986813102957
['x_12', 'x_31', 'x_35', 'x_23', 'x_24', 'x_8']
x_36 		 1.5297986813102957 1.6130993077250164
x_40 		 1.5297986813102957 1.555984811946637
x_28 		 1.5297986813102957 1.6925173555440967
x_20 1.5297986813102957 1.5271064026960743
['x_12', 'x_31', 'x_35', 'x_23', 'x_24', 'x_8', 'x_20']
x_9 		 1.5271064026960743 1.8411891509516514
x_13 		 1.5271064026960743 1.548138327673871
x_19 1.5271064026960743 1.4978331123313762
['x_12', 'x_31', 'x_35', 'x_23', 'x_24', 'x_8', 'x_20', 'x_19']
x_32 1.4978331123313762 1.4460745206551833
['x_12', 'x_31', 'x_35', 'x_23', 'x_24', 'x_8', 'x_20', 'x_19', 'x_32']
x_4 		 1.4460745206551833 1.4771024752878223
['x_12', 'x_31', 'x_35', 'x_23', 'x_24', 'x_8', 'x_20', 'x_19', 'x_32']
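Condensed, the cell above implements the following greedy scheme (a sketch; cv_mse is a hypothetical stand-in for the seeded three-model CV routine in the cell):

def backward_eliminate(features, cv_mse):
    # Greedy single-pass backward elimination, mirroring the loop above.
    kept = list(features)
    best = cv_mse(kept)                      # baseline with every feature
    for f in list(kept[1:]):                 # the first feature only sets the baseline
        trial = [c for c in kept if c != f]
        score = cv_mse(trial)
        if score < best:                     # dropping f helps, so make it permanent
            best, kept = score, trial
    return kept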
In [25]:
################### RFC: second elimination pass (same procedure, new shuffle order)
np.random.shuffle(use)

I = 0
remove   = []

for u in use:
    Loss = []
    dtest[target] = 0
    for seed in range(seeds):        
        if I == 0:
            usen = use.copy()
        else:
            usen = use.copy()
            usen.remove(u)
            for f in remove:
                if f in usen:
                    usen.remove(f)            
        
        params0['random_state'] = seed            
        clf0 = LGBMRegressor(**params0)
            
        params1['random_state'] = seed
        clf1  = LGBMRegressor(**params1)
            
        params2['random_state'] = seed
        clf2  = LGBMRegressor(**params2)
            
        esr = 100
            
        skf = StratifiedKFold(n_splits = n_splits, shuffle = True, random_state = seed)
        for train_index, test_index in skf.split(dtrain, dtrain[target]):
            X_trg, X_val = dtrain[usen].loc[train_index],    dtrain[usen].loc[test_index]
            y_trg, y_val = dtrain[target].loc[train_index], dtrain[target].loc[test_index]
            
            clf0.fit(
                X_trg, 
                y_trg, 
                verbose = False, 
                eval_set = (X_val, y_val),
                early_stopping_rounds = esr,
                sample_weight = dtrain['weights'].loc[train_index]
            )                      
            temp0 = clf0.predict(X_val)
            pred0 = clf0.predict(dtest[usen])
                    
            clf1.fit(
                X_trg, 
                y_trg, 
                verbose = False, 
                eval_set = (X_val, y_val),
                early_stopping_rounds = esr, 
                sample_weight = dtrain['weights'].loc[train_index]
            )                      
            temp1 = clf1.predict(X_val)
            pred1 = clf1.predict(dtest[usen])
            
            clf2.fit(
                X_trg, 
                y_trg, 
                verbose = False, 
                eval_set = (X_val, y_val),
                early_stopping_rounds = esr,
                sample_weight = dtrain['weights'].loc[train_index]
            )                      
            temp2 = clf2.predict(X_val)
            pred2 = clf2.predict(dtest[usen])
                
            pred = (pred0 + pred1 + pred2) / 3
            dtest[target] += pred                
                    
            temp = (temp0 + temp1 + temp2) / 3              

            sc = mean_squared_error(y_val, temp)
            Loss.append(sc)
            
    if I == 0:
        best = np.mean(Loss)
        print('start score', best)
    else:
        if best > np.mean(Loss):
            print(u, best, np.mean(Loss))
            best = np.mean(Loss)
            remove.append(u)
            print(remove)
        else:
            print(u, '		', best, np.mean(Loss))  
    
    del Loss                
    I += 1

################### RFC

print(remove)

for u in remove:
    if u in use:
        use.remove(u)
start score 1.4496209434191165
x_4 		 1.4496209434191165 1.4682643669482238
x_40 		 1.4496209434191165 1.4523718141470239
x_43 1.4496209434191165 1.4319693304928505
['x_43']
x_9 		 1.4319693304928505 1.7843356983765593
x_39 		 1.4319693304928505 1.4579145281323809
x_13 1.4319693304928505 1.4087640357927358
['x_43', 'x_13']
x_28 		 1.4087640357927358 1.7908206024220463
x_3 		 1.4087640357927358 1.4640140038281158
x_14 		 1.4087640357927358 1.4417577413574154
x_10 		 1.4087640357927358 1.5439146030889228
['x_43', 'x_13']
In [26]:
################### RFC: third elimination pass; the empty remove list below signals convergence
np.random.shuffle(use)

I = 0
remove   = []

for u in use:
    Loss = []
    dtest[target] = 0
    for seed in range(seeds):        
        if I == 0:
            usen = use.copy()
        else:
            usen = use.copy()
            usen.remove(u)
            for f in remove:
                if f in usen:
                    usen.remove(f)            
        
        params0['random_state'] = seed            
        clf0 = LGBMRegressor(**params0)
            
        params1['random_state'] = seed
        clf1  = LGBMRegressor(**params1)
            
        params2['random_state'] = seed
        clf2  = LGBMRegressor(**params2)
            
        esr = 100
            
        skf = StratifiedKFold(n_splits = n_splits, shuffle = True, random_state = seed)
        for train_index, test_index in skf.split(dtrain, dtrain[target]):
            X_trg, X_val = dtrain[usen].loc[train_index],    dtrain[usen].loc[test_index]
            y_trg, y_val = dtrain[target].loc[train_index], dtrain[target].loc[test_index]
            
            clf0.fit(
                X_trg, 
                y_trg, 
                verbose = False, 
                eval_set = (X_val, y_val),
                early_stopping_rounds = esr,
                sample_weight = dtrain['weights'].loc[train_index]
            )                      
            temp0 = clf0.predict(X_val)
            pred0 = clf0.predict(dtest[usen])
                    
            clf1.fit(
                X_trg, 
                y_trg, 
                verbose = False, 
                eval_set = (X_val, y_val),
                early_stopping_rounds = esr, 
                sample_weight = dtrain['weights'].loc[train_index]
            )                      
            temp1 = clf1.predict(X_val)
            pred1 = clf1.predict(dtest[usen])
            
            clf2.fit(
                X_trg, 
                y_trg, 
                verbose = False, 
                eval_set = (X_val, y_val),
                early_stopping_rounds = esr,
                sample_weight = dtrain['weights'].loc[train_index]
            )                      
            temp2 = clf2.predict(X_val)
            pred2 = clf2.predict(dtest[usen])
                
            pred = (pred0 + pred1 + pred2) / 3
            dtest[target] += pred                
                    
            temp = (temp0 + temp1 + temp2) / 3              

            sc = mean_squared_error(y_val, temp)
            Loss.append(sc)
            
    if I == 0:
        best = np.mean(Loss)
        print('start score', best)
    else:
        if best > np.mean(Loss):
            print(u, best, np.mean(Loss))
            best = np.mean(Loss)
            remove.append(u)
            print(remove)
        else:
            print(u, '		', best, np.mean(Loss))  
    
    del Loss                
    I += 1

################### RFC

print(remove)

for u in remove:
    if u in use:
        use.remove(u)
start score 1.3982122586858419
x_36 		 1.3982122586858419 1.5788958115324099
x_28 		 1.3982122586858419 1.7741863960034117
x_10 		 1.3982122586858419 1.5308340007123231
x_14 		 1.3982122586858419 1.4560194114116045
x_9 		 1.3982122586858419 1.746497185374684
x_40 		 1.3982122586858419 1.4539260651081267
x_4 		 1.3982122586858419 1.4354286356760073
x_39 		 1.3982122586858419 1.434863671265331
[]
In [ ]:

In [27]:
seeds    = 3
n_splits = 5
N_estimators = 1000
target = 'target'

params0, params1, params2 = get_params(dtrain, target, use, n_splits, 10)
rmse... 1.5709147217550457 Best... 1.5709147217550457 Iter  1
rmse... 1.5508985479411896 Best... 1.5508985479411896 Iter  2
rmse... 1.573016489795622 Best... 1.5508985479411896 Iter  3
rmse... 1.5582837759831583 Best... 1.5508985479411896 Iter  4
rmse... 1.5681214611351213 Best... 1.5508985479411896 Iter  5
rmse... 1.57353184593852 Best... 1.5508985479411896 Iter  6
rmse... 1.5758244091568625 Best... 1.5508985479411896 Iter  7
rmse... 1.5564757482864828 Best... 1.5508985479411896 Iter  8
rmse... 1.5856977651219941 Best... 1.5508985479411896 Iter  9
rmse... 1.5145039312543003 Best... 1.5145039312543003 Iter  10

 Best score 1.5145039312543003 

{'subsample': 0.9, 'colsample_bytree': 0.9, 'reg_alpha': 0.2, 'reg_lambda': 0.2, 'learning_rate': 0.29999999999999993, 'num_leaves': 50, 'max_depth': 50, 'n_estimators': 1000} 
 {'subsample': 0.915866702114476, 'colsample_bytree': 1.0, 'reg_alpha': 0.2, 'reg_lambda': 1e-14, 'learning_rate': 0.29999999999999993, 'num_leaves': 75, 'max_depth': 3, 'n_estimators': 1000} 
 {'subsample': 0.9, 'colsample_bytree': 0.9, 'reg_alpha': 0.2, 'reg_lambda': 0.2, 'learning_rate': 0.29999999999999993, 'num_leaves': 75, 'max_depth': 3, 'n_estimators': 1000}
In [ ]:

In [28]:
Loss     = []
dtest[target] = 0   # test predictions are accumulated over all seeds and folds

if True:
        
        for seed in range(seeds):            
            params0['random_state'] = seed            
            clf0 = LGBMRegressor(**params0)
            
            params1['random_state'] = seed
            clf1  = LGBMRegressor(**params1)
            
            params2['random_state'] = seed
            clf2  = LGBMRegressor(**params2)
            
            esr = 100
            I = 0
            
            skf = StratifiedKFold(n_splits = n_splits, shuffle = True, random_state = seed)
            for train_index, test_index in skf.split(dtrain, dtrain[target]):
                X_trg, X_val = dtrain[use].loc[train_index],    dtrain[use].loc[test_index]
                y_trg, y_val = dtrain[target].loc[train_index], dtrain[target].loc[test_index]
            
                clf0.fit(
                    X_trg, 
                    y_trg, 
                    verbose = False, 
                    eval_set = (X_val, y_val),
                    early_stopping_rounds = esr,
                    sample_weight = dtrain['weights'].loc[train_index]
                )                      
                temp0 = clf0.predict(X_val)
                pred0 = clf0.predict(dtest[use])
                    
                clf1.fit(
                    X_trg, 
                    y_trg, 
                    verbose = False, 
                    eval_set = (X_val, y_val),
                    early_stopping_rounds = esr, 
                    sample_weight = dtrain['weights'].loc[train_index]
                )                      
                temp1 = clf1.predict(X_val)
                pred1 = clf1.predict(dtest[use])
            
                clf2.fit(
                    X_trg, 
                    y_trg, 
                    verbose = False, 
                    eval_set = (X_val, y_val),
                    early_stopping_rounds = esr,
                    sample_weight = dtrain['weights'].loc[train_index]
                )                      
                temp2 = clf2.predict(X_val)
                pred2 = clf2.predict(dtest[use])
                
                pred = (pred0 + pred1 + pred2) / 3
                dtest[target] += pred                
                    
                temp = (temp0 + temp1 + temp2) / 3              

                sc = mean_squared_error(y_val, temp)
                Loss.append(sc)
                
            print(np.mean(Loss))
        
print(np.mean(Loss))
                
dtest[target] = dtest[target] / (seeds * n_splits)
1.4975615504024486
1.5057990276432522
1.5145039312543003
1.5145039312543003
In [ ]:

In [42]:
predictions = dtest[target].values

submission = pd.DataFrame({"label": predictions})
submission.loc[submission['label'] < 0, 'label'] = 0   # clip negative car counts to zero
#submission['label'] = submission['label'] - predictions.min()
submission
Out[42]:
label
0 2.259465
1 2.437215
2 3.329918
3 3.048174
4 3.143170
... ...
596 1.271802
597 2.127097
598 2.987384
599 2.911476
600 2.980742

601 rows × 1 columns
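The submitted labels are raw regression outputs. Since the target is a car count, rounding is the obvious post-process to try; a sketch of that variant (not what was submitted):

# Hypothetical integer-count variant of the submission above:
submission_int = submission.copy()
submission_int['label'] = submission_int['label'].clip(lower = 0).round().astype(int)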

In [43]:
submission['label'].min()
Out[43]:
0.0
In [44]:
dtrain.target.hist()
Out[44]:
<AxesSubplot:>
In [45]:
submission.label.hist()
Out[45]:
<AxesSubplot:>
In [46]:
!rm -rf assets
!mkdir assets
submission.to_csv(os.path.join("assets", "submission.csv"))
stop   # intentional NameError: halts "Run All" before the login/submission step below
In [ ]:

In [47]:
%load_ext aicrowd.magic
%aicrowd login
The aicrowd.magic extension is already loaded. To reload it, use:
  %reload_ext aicrowd.magic
Please login here: https://api.aicrowd.com/auth/nIlh0hH8gnHLzsZSCsXVjfHBSS0LCR35VnrM6legWTk
Login Error: Couldn't login. Max retries exceeded
Please try logging in again
Using notebook: /mnt/c/Kaggle/AI6/LiDAR/01/08.ipynb for submission...
Removing existing files from submission directory...
Scrubbing API keys from the notebook...
Collecting notebook...
submission.zip ━━━━━━━━━━━━━━━━━━━━━━ 100.0% • 81.4/79.8 KB • 1.7 MB/s • 0:00:00
                                                   ╭─────────────────────────╮                                                    
                                                   │ Successfully submitted! │                                                    
                                                   ╰─────────────────────────╯                                                    
                                                         Important links                                                          
┌──────────────────┬─────────────────────────────────────────────────────────────────────────────────────────────────────────────┐
│  This submission │ https://www.aicrowd.com/challenges/ai-blitz-xi/problems/lidar-car-detection/submissions/155331              │
│                  │                                                                                                             │
│  All submissions │ https://www.aicrowd.com/challenges/ai-blitz-xi/problems/lidar-car-detection/submissions?my_submissions=true │
│                  │                                                                                                             │
│      Leaderboard │ https://www.aicrowd.com/challenges/ai-blitz-xi/problems/lidar-car-detection/leaderboards                    │
│                  │                                                                                                             │
│ Discussion forum │ https://discourse.aicrowd.com/c/ai-blitz-xi                                                                 │
│                  │                                                                                                             │
│   Challenge page │ https://www.aicrowd.com/challenges/ai-blitz-xi/problems/lidar-car-detection                                 │
└──────────────────┴─────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
In [ ]:

