In [ ]:
import os
In [ ]:
# Set KAGGLE_CONFIG_DIR for the `kaggle` commands run below.
# NOTE(review): presumably this Drive folder holds the kaggle.json credentials
# file and Drive is already mounted at /content/drive — confirm.
os.environ['KAGGLE_CONFIG_DIR'] = "/content/drive/MyDrive/kaggle"
In [ ]:
!kaggle datasets download -d CooperUnion/cardataset
In [ ]:
!cd drive/MyDrive/kaggle
In [ ]:
!kaggle datasets download -d CooperUnion/cardataset --force
In [ ]:
!ls
In [ ]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline
In [ ]:
# Load the dataset into a DataFrame.
# NOTE(review): assumes data.csv already exists in the current working
# directory; the step that extracts it from the downloaded archive is not
# shown in this notebook — confirm.
df = pd.read_csv("data.csv")
In [ ]:
df.head()
Out[ ]:
In [ ]:
# Normalise column names: lowercase, with underscores instead of spaces.
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Names of the columns pandas reports with the 'object' dtype (text columns).
# The variable name is kept as-is because a later cell displays it.
string_coloumns = list(df.dtypes[df.dtypes == 'object'].index)

# Apply the same normalisation to the text values themselves.
# The replacement target must be '_': replacing ' ' with ' ' is a no-op and
# left the values inconsistent with the column names.
for col in string_coloumns:
    df[col] = df[col].str.lower().str.replace(' ', '_')
In [ ]:
string_coloumns
Out[ ]:
In [ ]:
df.head()
Out[ ]:
In [ ]:
# Histogram of the raw prices. `distplot` is deprecated in seaborn;
# `histplot` draws a plain count histogram (no KDE) by default.
sns.histplot(df.msrp)
Out[ ]:
In [ ]:
# Same histogram restricted to prices below 100000, so the bulk of the
# distribution is not squashed by the long right tail.
# `histplot` replaces the deprecated `distplot(..., kde=False)`.
sns.histplot(df.msrp[df.msrp < 100000])
Out[ ]:
In [ ]:
# log(1 + price) for every row of the msrp column.
log_price = np.log1p(df["msrp"])
In [ ]:
# Histogram of the log-transformed prices.
# `histplot` replaces the deprecated `distplot(..., kde=False)`.
sns.histplot(log_price)
Out[ ]:
In [ ]:
# Number of missing values in each column.
df.isnull().sum()
Out[ ]:
In [ ]:
# Split the rows into train / validation / test parts (60% / 20% / 20%).
n = len(df)
n_test = int(n * 0.2)
# The validation share must be 0.2, not 2: with int(n*2) the validation size
# was twice the dataset, n_train went negative and every slice below was wrong.
n_val = int(n * 0.2)
n_train = n - (n_test + n_val)

# Shuffle row positions with a fixed seed so the split is reproducible.
np.random.seed(2)
idx = np.arange(n)
np.random.shuffle(idx)
df_shuffled = df.iloc[idx]

# Consecutive, non-overlapping slices of the shuffled frame; .copy() makes
# each part independent of df_shuffled so later edits do not touch it.
df_train = df_shuffled[:n_train].copy()
df_val = df_shuffled[n_train:n_train + n_val].copy()
df_test = df_shuffled[n_train + n_val:].copy()
In [ ]:
# Targets for each split: log(1 + msrp) taken from the underlying value arrays.
y_train, y_val, y_test = (
    np.log1p(part.msrp.values)
    for part in (df_train, df_val, df_test)
)
To avoid accidentally using the target variable later, let’s remove it from the dataframes:
In [ ]:
# Drop the target column from every split, in place.
for split_frame in (df_train, df_val, df_test):
    del split_frame['msrp']
In [ ]:
/content/result.csv
Content
Comments
You must log in before you can post a comment.