Getting the data and testing

'''
#For getting the cleaned data [was run in VS Code]
import pandas as pd
import numpy as np
import category_encoders as ce

colnames=["symboling", "normalized-losses", "make", "fuel-type", "aspiration", "num-of-doors", "body-style", "drive-wheels", "engine-location", "wheel-base", "length", "width", "height", "curb-weight", "engine-type", "num-of-cylinders", "engine-size", "fuel-system", "bore", "stroke", "compression-ratio", "horsepower", "peak-rpm", "city-mpg", "highway-mpg", "price" ]
df=pd.read_csv("df/imports-85.data", header=None)
df.columns=colnames
len(df)
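
#Optional check: count the "?" markers per column to see which columns
#actually carry the missing values
df.isin(["?"]).sum()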

#Missing values are denoted by "?"
#Removing missing values
temp=df[df!="?"]        #masking turns every "?" into NaN
df=temp.dropna()
len(df)                 #removed 46 rows that contained missing values

df.head()
df.dtypes

#Use an ordinal categorical encoder for the non-numeric columns.
#First, cast columns that are numeric but typed as object, and record which
#column names are categorical and which are numerical.
categoricalcolname=[]; Numericalcolname=[]
for i in colnames:
    try:
        df[i]=df[i].astype("float32")
        Numericalcolname.append(i)
    except ValueError:                  #non-numeric strings cannot be cast
        if df[i].dtype=="object":
            categoricalcolname.append(i)
        
#Sort df by price, lowest to highest, before encoding: OrdinalEncoder assigns
#integer codes in order of first appearance, so sorting makes each category's
#code roughly monotone in price, a crude way to reduce encoding bias for a
#linear model.
df=df.sort_values(by=["price"])

#Ordinal-encode the columns with categorical data
encoder = ce.OrdinalEncoder(cols=categoricalcolname)
df=encoder.fit_transform(df)
df=df.astype("float32")
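
#Quick sanity check on the sort-then-encode trick: the integer codes for a
#categorical column such as "make" should now tend to correlate positively
#with price
df[["make", "price"]].corr()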

df.to_csv("DF/cleaned.csv")
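#Note: to_csv writes the row index as an unnamed first column by default
#(index=False would suppress it); the R code below drops that column.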


#Trying out standardization

#testing
df=pd.read_csv("DF/cleaned.csv")
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import scale

X=df.iloc[:,1:]          #skip the index column written by to_csv
y=X.pop("price")

#Normal regression
model=LinearRegression()
model.fit(X,y )
model.score(X,y)      #0.9136574938779927

#Standardized features
Xscaled=scale(X)

model=LinearRegression()
model.fit(Xscaled,y )
model.score(Xscaled,y)    #0.9136574938779927
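
#Identical scores are expected: OLS fitted values, and hence R², are invariant
#to affine rescaling of the features; scaling changes only the coefficients.
#np.allclose(model.predict(Xscaled), LinearRegression().fit(X, y).predict(X))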

#Standardize the numerical columns only
for i in Numericalcolname:
    df[i]=scale(df[i])

#Rebuild X and y from the partially scaled frame
Xpartial=df.iloc[:,1:]
ypartial=Xpartial.pop("price")

model=LinearRegression()
model.fit(Xpartial, ypartial)
model.score(Xpartial, ypartial)  #0.9136574938779927
'''

#Modules (R code from here on)

library(lmtest)
library(dgof)
library(glmnet)
library(car)
library(PerformanceAnalytics)
library(lmridge)
library(MASS)

#Input data

df=read.csv("DF/cleaned.csv")
df=df[2:length(df)]     #drop the pandas index column (the first column)
head(df)
##   symboling normalized.losses make fuel.type aspiration num.of.doors body.style
## 1         2                83    1         1          1            1          1
## 2         2               121    2         1          1            1          1
## 3         1               104    3         1          1            1          1
## 4         1                87    4         1          1            1          1
## 5         2               161    5         1          1            1          1
## 6         1               101    6         1          1            1          1
##   drive.wheels engine.location wheel.base length width height curb.weight
## 1            1               1       93.7  156.9  63.4   53.7        2050
## 2            1               1       88.4  141.1  60.3   53.2        1488
## 3            1               1       93.1  159.1  64.2   54.1        1890
## 4            1               1       95.7  158.7  63.6   54.5        1985
## 5            1               1       93.7  157.3  64.4   50.8        1918
## 6            1               1       93.7  150.0  64.0   52.6        1837
##   engine.type num.of.cylinders engine.size fuel.system bore stroke
## 1           1                1          97           1 3.62   2.36
## 2           2                2          61           1 2.91   3.03
## 3           3                1          91           1 3.03   3.15
## 4           3                1          92           1 3.05   3.03
## 5           3                1          92           1 2.97   3.23
## 6           3                1          79           2 2.91   3.07
##   compression.ratio horsepower peak.rpm city.mpg highway.mpg price
## 1               9.0         69     4900       31          36  5118
## 2               9.5         48     5100       47          53  5151
## 3               9.0         68     5000       30          31  5195
## 4               9.0         62     4800       35          39  5348
## 5               9.4         68     5500       37          41  5389
## 6              10.1         60     5500       38          42  5399
sum(is.na(df))
## [1] 0

#Model
##Linear Regression

#Normal linear regression
model.linear= lm(price~symboling+normalized.losses+ make+fuel.type+ aspiration+num.of.doors+ body.style+drive.wheels+ engine.location+wheel.base+ length+width+ height+curb.weight+ engine.type+num.of.cylinders+ engine.size+fuel.system+ bore+stroke+ compression.ratio+horsepower+ peak.rpm+city.mpg+ highway.mpg, data=df)
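
#Equivalent shorthand, since every other column is a predictor:
#  model.linear <- lm(price ~ ., data = df)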

AIC.linear=AIC(model.linear)
ADJrsq.linear=summary(model.linear)$adj.r.squared

###Stepwise/Backward/Forward

#---Stepwise Regression Model----
step.model <- stepAIC(model.linear, direction = "both", trace = F)
#summary(step.model)

AIC.step=AIC(step.model)
ADJrsq.step=summary(step.model)$adj.r.squared

#---Backward Regression Model----
back.model <- stepAIC(model.linear, direction = "backward", trace = F)
#summary(back.model)

AIC.back=AIC(back.model)
ADJrsq.back=summary(back.model)$adj.r.squared

#---Forward Regression Model----
fwd.model <- stepAIC(model.linear, direction = "forward", trace = F)
#summary(fwd.model)

AIC.fwd=AIC(fwd.model)
ADJrsq.fwd=summary(fwd.model)$adj.r.squared

###Comparing the linear regression models

#Comparing Linear model
compare1=matrix(c(AIC.linear, ADJrsq.linear,
                  AIC.step, ADJrsq.step,
                  AIC.back, ADJrsq.back,
                  AIC.fwd, ADJrsq.fwd),
                ncol = 2,byrow=TRUE)
colnames(compare1)=c("AIC", "ADJRsq")
rownames(compare1)=c("Linear", "Stepwise", "Backward", "Forward")
compare1
##          AIC      ADJRsq   
## Linear   2872.665 0.8981932
## Stepwise 2850.594 0.9048743
## Backward 2850.594 0.9048743
## Forward  2872.665 0.8981932
#From here on we use the variables selected by the backward model: it gives a
#lower AIC and a higher adjusted R² than the full model. (Forward selection
#starting from the full model has nothing left to add, which is why its row
#matches the full linear model exactly.)

##Ridge

model.ridge<-lmridge(formula = price ~ normalized.losses + make + aspiration + drive.wheels + width + height + curb.weight + num.of.cylinders + engine.size + stroke + horsepower,df,scaling="centered")

#summary(model.ridge)    #both the adjusted R² and the AIC are read from this summary
AIC.ridge=2395.372
ADJrsq.ridge=0.9055
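
#If hard-coding is undesirable, lmridge also exposes these programmatically;
#treat the exact function names below as an assumption about the package API:
#  infocr(model.ridge)    #AIC and BIC
#  rstats1(model.ridge)   #R-squared, adjusted R-squared, and related statistics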

##Lasso

x= data.matrix(df[c("normalized.losses", "make", "aspiration", "drive.wheels", "width", "height", "curb.weight", "num.of.cylinders", "engine.size", "stroke", "horsepower")])

y= df$price

cv.l<-cv.glmnet(x,y,alpha=1);plot(cv.l)

best.ll<-cv.l$lambda.min
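
#A more conservative choice is cv.l$lambda.1se, the largest lambda whose CV
#error is within one standard error of the minimum; it yields a sparser model.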

model.lasso<-glmnet(x,y,alpha=1,lambda=best.ll)
coefficients(model.lasso)
## 12 x 1 sparse Matrix of class "dgCMatrix"
##                              s0
## (Intercept)       -20303.190075
## normalized.losses     -9.434832
## make                 389.272561
## aspiration          1114.685514
## drive.wheels         757.171062
## width                394.151051
## height              -124.769723
## curb.weight            2.810299
## num.of.cylinders    1317.924679
## engine.size           27.407401
## stroke             -1532.128527
## horsepower            14.569490
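#An in-sample R² for comparison (a sketch; pred and rsq.lasso are names
#introduced here), as an alternative to the CV-based figure computed below:
pred <- predict(model.lasso, newx = x)
rsq.lasso <- 1 - sum((y - pred)^2) / sum((y - mean(y))^2)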
#Pseudo R² from the cross-validated MSE, 1 - cvm/var(y); the last element of
#cvm corresponds to the smallest lambda on the path
wkwk=1-cv.l$cvm/var(y)
ADJrsq.lasso=wkwk[length(wkwk)]
ADJrsq.lasso
## [1] 0.8877881
AIC.lasso=NA          #AIC is not well-defined for the lasso (its effective
                      #degrees of freedom are not straightforward), so leave NA

#Comparing linear, ridge, and lasso

#Comparing things
compare2=matrix(c(AIC.back,ADJrsq.back,
                  AIC.ridge, ADJrsq.ridge,
                  AIC.lasso, ADJrsq.lasso),
                ncol = 2,byrow=TRUE)
colnames(compare2)=c("AIC", "ADJRsq")
rownames(compare2)=c("Linear (Backward)", "Ridge", "Lasso")
compare2        #Ridge gives the best overall fit
##                   AIC      ADJRsq   
## Linear (Backward) 2850.594 0.9048743
## Ridge             2395.372 0.9055   
## Lasso             NA       0.8877881

#Explanation
Data cleaning and the ordinal encoding of the categorical variables were done
in Python. At that stage, an ordinary linear regression model was compared
with one whose X had been standardized; the resulting R² was identical.

The analysis then moved to R to compare the linear, ridge, and lasso models.

First, a linear regression was fitted with all predictors; reduced models were
then built with the stepwise/forward/backward methods. The backward method
gave a higher adjusted R² and a lower AIC than the full linear model, so the
predictors it selected were carried into the ridge and lasso fits.

Predictors used: "normalized.losses", "make", "aspiration", "drive.wheels", "width", "height", "curb.weight", "num.of.cylinders", "engine.size", "stroke", "horsepower"

Of the three models (linear, ridge, and lasso), the best is the ridge model,
with AIC 2395.372 and adjusted R² 0.9055. Its fitted equation is:

predicted price = -19309.3505 - 10.354*normalized.losses + 394.7738*make + 1128.6086*aspiration + 767.2547*drive.wheels + 394.9411*width - 139.8853*height + 2.8085*curb.weight + 1320.375*num.of.cylinders + 27.5969*engine.size - 1597.3331*stroke + 14.5399*horsepower
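
#A worked check of the ridge equation on the first row of df (price.hat and
#newcar are names introduced for this sketch):
newcar <- df[1, ]
price.hat <- -19309.3505 - 10.354*newcar$normalized.losses + 394.7738*newcar$make +
  1128.6086*newcar$aspiration + 767.2547*newcar$drive.wheels + 394.9411*newcar$width -
  139.8853*newcar$height + 2.8085*newcar$curb.weight + 1320.375*newcar$num.of.cylinders +
  27.5969*newcar$engine.size - 1597.3331*newcar$stroke + 14.5399*newcar$horsepower
price.hat     #compare with the observed price, df$price[1] (5118)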