Loading

NeurIPS 2020: Procgen Competition

bookcamp

machine example

newton7777
In [ ]:
import os
In [ ]:
os.environ['KAGGLE_CONFIG_DIR'] = "/content/drive/MyDrive/kaggle"
In [ ]:
!kaggle datasets download -d CooperUnion/cardataset
Downloading cardataset.zip to /content
  0% 0.00/103k [00:00<?, ?B/s]
100% 103k/103k [00:00<00:00, 33.2MB/s]
In [ ]:
!cd drive/MyDrive/kaggle
In [ ]:
!kaggle datasets download -d CooperUnion/cardataset --force
Downloading cardataset.zip to /content/drive/MyDrive/kaggle
  0% 0.00/103k [00:00<?, ?B/s]
100% 103k/103k [00:00<00:00, 13.7MB/s]
In [ ]:
!ls
cardataset.zip	data.csv  kaggle.json
In [ ]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline
In [ ]:
df = pd.read_csv("data.csv")
In [ ]:
df.head()
Out[ ]:
Make Model Year Engine Fuel Type Engine HP Engine Cylinders Transmission Type Driven_Wheels Number of Doors Market Category Vehicle Size Vehicle Style highway MPG city mpg Popularity MSRP
0 BMW 1 Series M 2011 premium unleaded (required) 335.0 6.0 MANUAL rear wheel drive 2.0 Factory Tuner,Luxury,High-Performance Compact Coupe 26 19 3916 46135
1 BMW 1 Series 2011 premium unleaded (required) 300.0 6.0 MANUAL rear wheel drive 2.0 Luxury,Performance Compact Convertible 28 19 3916 40650
2 BMW 1 Series 2011 premium unleaded (required) 300.0 6.0 MANUAL rear wheel drive 2.0 Luxury,High-Performance Compact Coupe 28 20 3916 36350
3 BMW 1 Series 2011 premium unleaded (required) 230.0 6.0 MANUAL rear wheel drive 2.0 Luxury,Performance Compact Coupe 28 18 3916 29450
4 BMW 1 Series 2011 premium unleaded (required) 230.0 6.0 MANUAL rear wheel drive 2.0 Luxury Compact Convertible 28 18 3916 34500
In [ ]:
df.columns = df.columns.str.lower().str.replace(' ', '_')
 string_coloumns = list(df.dtypes[df.dtypes == 'object'].index)
 for col in string_coloumns:
   df[col] = df[col].str.lower().str.replace(' ',' ')
In [ ]:
string_coloumns
Out[ ]:
['make',
 'model',
 'engine_fuel_type',
 'transmission_type',
 'driven_wheels',
 'market_category',
 'vehicle_size',
 'vehicle_style']
In [ ]:
df.head()
Out[ ]:
make model year engine_fuel_type engine_hp engine_cylinders transmission_type driven_wheels number_of_doors market_category vehicle_size vehicle_style highway_mpg city_mpg popularity msrp
0 bmw 1 series m 2011 premium unleaded (required) 335.0 6.0 manual rear wheel drive 2.0 factory tuner,luxury,high-performance compact coupe 26 19 3916 46135
1 bmw 1 series 2011 premium unleaded (required) 300.0 6.0 manual rear wheel drive 2.0 luxury,performance compact convertible 28 19 3916 40650
2 bmw 1 series 2011 premium unleaded (required) 300.0 6.0 manual rear wheel drive 2.0 luxury,high-performance compact coupe 28 20 3916 36350
3 bmw 1 series 2011 premium unleaded (required) 230.0 6.0 manual rear wheel drive 2.0 luxury,performance compact coupe 28 18 3916 29450
4 bmw 1 series 2011 premium unleaded (required) 230.0 6.0 manual rear wheel drive 2.0 luxury compact convertible 28 18 3916 34500
In [ ]:
sns.distplot(df.msrp,kde = False)
/usr/local/lib/python3.7/dist-packages/seaborn/distributions.py:2557: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)
Out[ ]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f6ffb934790>
In [ ]:
sns.distplot(df.msrp[df.msrp < 100000]  ,kde = False)
/usr/local/lib/python3.7/dist-packages/seaborn/distributions.py:2557: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)
Out[ ]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f6ff1073490>
In [ ]:
log_price = np.log1p(df.msrp)
In [ ]:
sns.distplot(log_price, kde=False)
/usr/local/lib/python3.7/dist-packages/seaborn/distributions.py:2557: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)
Out[ ]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f6ff183b950>
In [ ]:
df.isnull().sum()
Out[ ]:
make                    0
model                   0
year                    0
engine_fuel_type        3
engine_hp              69
engine_cylinders       30
transmission_type       0
driven_wheels           0
number_of_doors         6
market_category      3742
vehicle_size            0
vehicle_style           0
highway_mpg             0
city_mpg                0
popularity              0
msrp                    0
dtype: int64
In [ ]:
n = len(df)

n_test = int(n*0.2)
n_val = int(n*2)
n_train = n - (n_test + n_val)

np.random.seed(2)
idx = np.arange(n)
np.random.shuffle(idx)

df_shuffled = df.iloc[idx]
df_train = df_shuffled[:n_train].copy()
df_val = df_shuffled[n_train:n_train + n_val].copy() 
df_test = df_shuffled[n_train + n_val:].copy()
In [ ]:
y_train = np.log1p(df_train.msrp.values)
y_val = np.log1p(df_val.msrp.values)
y_test = np.log1p(df_test.msrp.values)

To avoid accidentally using the target variable later, let’s remove it from the dataframes:

In [ ]:
del df_train['msrp']
del df_val['msrp']
del df_test['msrp']
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
/usr/local/lib/python3.7/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   2897             try:
-> 2898                 return self._engine.get_loc(casted_key)
   2899             except KeyError as err:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'msrp'

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
<ipython-input-63-c51c5c6c90df> in <module>()
----> 1 del df_train['msrp']
      2 del df_val['msrp']
      3 del df_test['msrp']

/usr/local/lib/python3.7/dist-packages/pandas/core/generic.py in __delitem__(self, key)
   3711             # there was no match, this call should raise the appropriate
   3712             # exception:
-> 3713             loc = self.axes[-1].get_loc(key)
   3714             self._mgr.idelete(loc)
   3715 

/usr/local/lib/python3.7/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   2898                 return self._engine.get_loc(casted_key)
   2899             except KeyError as err:
-> 2900                 raise KeyError(key) from err
   2901 
   2902         if tolerance is not None:

KeyError: 'msrp'
In [ ]:
/content/result.csv

Comments

You must login before you can post a comment.

Execute