Get data and testing
'''
#For getting the cleaned data [was run in VS Code]
import pandas as pd
import numpy as np
import category_encoders as ce
colnames=["symboling", "normalized-losses", "make", "fuel-type", "aspiration", "num-of-doors", "body-style", "drive-wheels", "engine-location", "wheel-base", "length", "width", "height", "curb-weight", "engine-type", "num-of-cylinders", "engine-size", "fuel-system", "bore", "stroke", "compression-ratio", "horsepower", "peak-rpm", "city-mpg", "highway-mpg", "price" ]
df=pd.read_csv("df/imports-85.data", header=None)
df.columns=colnames
len(df)
#Missing values are denoted by "?"
#Remove rows containing missing values
temp=df[df!="?"]
df=temp.dropna()
len(df) #46 rows with missing values were removed
df.head()
df.dtypes
#Use an ordinal encoder for the categorical columns
#Convert columns that are numeric but stored as object, and record which columns remain categorical
categoricalcolname=[]; Numericalcolname=[]
for i in colnames:
    try:
        df[i]=df[i].astype("float32")
        Numericalcolname.append(i)
    except ValueError:
        if df[i].dtype=="object":
            categoricalcolname.append(i)
#Sort df by ascending price before ordinal encoding so the category codes roughly follow price (a simple way to reduce encoding bias for the linear regression)
df=df.sort_values(by=["price"])
#Ordinal-encode the categorical columns
encoder = ce.OrdinalEncoder(cols=categoricalcolname)
df=encoder.fit_transform(df)
df=df.astype("float32")
df.to_csv("DF/cleaned.csv")
#Trying out standardization
#testing
df=pd.read_csv("DF/cleaned.csv")
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import scale
X=df.iloc[:,1:]
y=X.pop("price")
#Normal regression
model=LinearRegression()
model.fit(X,y )
model.score(X,y) #0.9136574938779927
#Regression on standardized X
Xscaled=scale(X)
model=LinearRegression()
model.fit(Xscaled,y )
model.score(Xscaled,y) #0.9136574938779927
#Standardize only the numeric columns, leaving the encoded categorical columns as-is
Xpart=X.copy()
for i in Numericalcolname:
    if i!="price":
        Xpart[i]=scale(Xpart[i])
model=LinearRegression()
model.fit(Xpart,y)
model.score(Xpart,y) #0.9136574938779927
'''
R
#Modules
library(lmtest)
library(dgof)
library(glmnet)
library(car)
library(PerformanceAnalytics)
library(lmridge)
library(MASS)
#Input data
df=read.csv("DF/cleaned.csv")
df=df[2:length(df)]
head(df)
## symboling normalized.losses make fuel.type aspiration num.of.doors body.style
## 1 2 83 1 1 1 1 1
## 2 2 121 2 1 1 1 1
## 3 1 104 3 1 1 1 1
## 4 1 87 4 1 1 1 1
## 5 2 161 5 1 1 1 1
## 6 1 101 6 1 1 1 1
## drive.wheels engine.location wheel.base length width height curb.weight
## 1 1 1 93.7 156.9 63.4 53.7 2050
## 2 1 1 88.4 141.1 60.3 53.2 1488
## 3 1 1 93.1 159.1 64.2 54.1 1890
## 4 1 1 95.7 158.7 63.6 54.5 1985
## 5 1 1 93.7 157.3 64.4 50.8 1918
## 6 1 1 93.7 150.0 64.0 52.6 1837
## engine.type num.of.cylinders engine.size fuel.system bore stroke
## 1 1 1 97 1 3.62 2.36
## 2 2 2 61 1 2.91 3.03
## 3 3 1 91 1 3.03 3.15
## 4 3 1 92 1 3.05 3.03
## 5 3 1 92 1 2.97 3.23
## 6 3 1 79 2 2.91 3.07
## compression.ratio horsepower peak.rpm city.mpg highway.mpg price
## 1 9.0 69 4900 31 36 5118
## 2 9.5 48 5100 47 53 5151
## 3 9.0 68 5000 30 31 5195
## 4 9.0 62 4800 35 39 5348
## 5 9.4 68 5500 37 41 5389
## 6 10.1 60 5500 38 42 5399
sum(is.na(df))
## [1] 0
#Model
##Linear Regression
#Normal linear regression
model.linear= lm(price~symboling+normalized.losses+ make+fuel.type+ aspiration+num.of.doors+ body.style+drive.wheels+ engine.location+wheel.base+ length+width+ height+curb.weight+ engine.type+num.of.cylinders+ engine.size+fuel.system+ bore+stroke+ compression.ratio+horsepower+ peak.rpm+city.mpg+ highway.mpg, data=df)
AIC.linear=AIC(model.linear)
ADJrsq.linear=summary(model.linear)[9]
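#Aside (not in the original script): summary(model.linear)[9] is the adjusted R-squared;
#pulling it by name returns the same value and is easier to read
ADJrsq.linear.byname=summary(model.linear)$adj.r.squared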
###Stepwise/Backward/Forward
#---Stepwise Regression Model----
step.model <- stepAIC(model.linear, direction = "both", trace = F)
#summary(step.model)
AIC.step=AIC(step.model)
ADJrsq.step=summary(step.model)[9]
#---Backward Regression Model----
back.model <- stepAIC(model.linear, direction = "backward", trace = F)
#summary(back.model)
AIC.back=AIC(back.model)
ADJrsq.back=summary(back.model)[9]
#---Forward Regression Model----
fwd.model <- stepAIC(model.linear, direction = "forward", trace = F)
#summary(fwd.model)
AIC.fwd=AIC(fwd.model)
ADJrsq.fwd=summary(fwd.model)[9]
###Comparing linear regression models
#Comparing Linear model
compare1=matrix(c(AIC.linear, ADJrsq.linear,
AIC.step, ADJrsq.step,
AIC.back, ADJrsq.back,
AIC.fwd, ADJrsq.fwd),
ncol = 2,byrow=TRUE)
colnames(compare1)=c("AIC", "ADJRsq")
rownames(compare1)=c("Linear", "Stepwise", "Backward", "Forward")
compare1
## AIC ADJRsq
## Linear 2872.665 0.8981932
## Stepwise 2850.594 0.9048743
## Backward 2850.594 0.9048743
## Forward 2872.665 0.8981932
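#Extra check (an aside, not part of the original comparison): the backward model is nested
#in the full linear model, so a partial F test can also assess whether the dropped
#predictors are jointly insignificant
anova(back.model, model.linear)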
#From here on we use the variables selected by the backward regression model, since it gives a lower AIC and a higher adjusted R-squared than the full model.
##Ridge
model.ridge<-lmridge(formula = price ~ normalized.losses + make + aspiration + drive.wheels + width + height + curb.weight + num.of.cylinders + engine.size + stroke + horsepower,df,scaling="centered")
#summary(model.ridge) #The AIC and adjusted R-squared below were taken from this summary output
AIC.ridge=2395.372
ADJrsq.ridge=0.9055
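#The two values above were copied by hand from summary(model.ridge). As an assumption
#(not verified here), the lmridge helpers infocr() and rstats1() may return the same
#statistics programmatically:
#infocr(model.ridge)   #information criteria (AIC/BIC) of the ridge fit
#rstats1(model.ridge)  #ridge fit statistics, including R-squared measures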
##Lasso
x= data.matrix(df[c("normalized.losses", "make", "aspiration", "drive.wheels", "width", "height", "curb.weight", "num.of.cylinders", "engine.size", "stroke", "horsepower")])
y= df$price
cv.l<-cv.glmnet(x,y,alpha=1);plot(cv.l)
best.ll<-cv.l$lambda.min
model.lasso<-glmnet(x,y,alpha=1,lambda=best.ll)
coefficients(model.lasso)
## 12 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) -20303.190075
## normalized.losses -9.434832
## make 389.272561
## aspiration 1114.685514
## drive.wheels 757.171062
## width 394.151051
## height -124.769723
## curb.weight 2.810299
## num.of.cylinders 1317.924679
## engine.size 27.407401
## stroke -1532.128527
## horsepower 14.569490
#Pseudo R-squared from the cross-validated MSE, taken at the last (smallest) lambda on the path
rsq.lasso.path=1-cv.l$cvm/var(y)
ADJrsq.lasso=rsq.lasso.path[length(rsq.lasso.path)]
ADJrsq.lasso
## [1] 0.8877881
AIC.lasso=NA #AIC is not well defined for the glmnet lasso fit, so it is left as NA
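#Aside (not in the original analysis): the same pseudo R-squared can be evaluated at
#lambda.min itself rather than at the last lambda on the path
idx.min=which(cv.l$lambda==cv.l$lambda.min)
1-cv.l$cvm[idx.min]/var(y)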
#Comparing linear, ridge, and lasso
#Comparing things
compare2=matrix(c(AIC.back,ADJrsq.back,
AIC.ridge, ADJrsq.ridge,
AIC.lasso, ADJrsq.lasso),
ncol = 2,byrow=TRUE)
colnames(compare2)=c("AIC", "ADJRsq")
rownames(compare2)=c("Linear (Backward)", "Ridge", "Lasso")
compare2 #Ridge gives the best overall fit (lowest AIC, highest adjusted R-squared)
## AIC ADJRsq
## Linear (Backward) 2850.594 0.9048743
## Ridge 2395.372 0.9055
## Lasso NA 0.8877881
#Explanation
Data cleaning and ordinal encoding of the categorical variables were done in Python. At that stage, a plain linear regression was compared with a linear regression on standardized X; the resulting R-squared was identical.
The analysis then continued in R to compare the linear, ridge, and lasso models.
First, a linear regression was fitted with all predictors, and then reduced models were built via stepwise/forward/backward selection. The backward method gave a higher adjusted R-squared and a lower AIC than the full linear model, so the predictors selected by the backward method were kept and tested with ridge and lasso.
Predictors used: normalized.losses, make, aspiration, drive.wheels, width, height, curb.weight, num.of.cylinders, engine.size, stroke, horsepower.
Of the three models (linear, ridge, and lasso), the best is the ridge model, with AIC 2395.372 and adjusted R-squared 0.9055. Its fitted equation is interpreted as follows:
Estimated price = -19309.3505 - 10.354*normalized.losses + 394.7738*make + 1128.6086*aspiration + 767.2547*drive.wheels + 394.9411*width - 139.8853*height + 2.8085*curb.weight + 1320.375*num.of.cylinders + 27.5969*engine.size - 1597.3331*stroke + 14.5399*horsepower
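As a quick worked example (a sketch applying the equation above as written; the vector names b, vars, and x1 are illustrative and not part of the original output), the fitted equation can be applied by hand to the first car in the data:
b=c(-19309.3505, -10.354, 394.7738, 1128.6086, 767.2547, 394.9411, -139.8853,
    2.8085, 1320.375, 27.5969, -1597.3331, 14.5399)
vars=c("normalized.losses", "make", "aspiration", "drive.wheels", "width", "height",
       "curb.weight", "num.of.cylinders", "engine.size", "stroke", "horsepower")
x1=c(1, unlist(df[1, vars]))  #1 for the intercept, then the predictor values of the first row
sum(b*x1)                     #predicted price for the first car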