# Data selection: load the second-hand car listings (Kaggle CSV) into UV
UV <- read.csv(
  file = "/Users/pin.lyu/Desktop/BC_Class_Folder/Econometrics/DIS_&_ASSIGNMENT/DIS_3/2ndhandcar.csv"
)
This data was scraped from https://www.carwale.com/, a popular platform in India for buying and selling cars. CarWale is a website and app that provides information about new and used cars in India. Users can research and compare different models, read reviews, and find dealerships in their area. This is a cross-sectional data set.
The data set contains information on used cars for sale, including model name, year of manufacture, kilometres driven, price, fuel type, and city.
\[ \text{Price} = \beta_0 + \beta_1 \text{Mileage} + \beta_2 \text{Year} + \epsilon \]
# Count the NA entries in each column of the raw import
vapply(UV, function(column) sum(is.na(column)), numeric(1))
## car_name car_price_in_rupees kms_driven fuel_type
## 0 0 0 0
## city year_of_manufacture
## 0 0
# Make a copy of the original data so the raw import stays untouched
UV2 <- UV

# Convert "car_price_in_rupees" from text to a number expressed in Lakh.
# Simply deleting the unit words put Crore prices on the Lakh scale
# (1 Crore = 100 Lakh, so they were 100x too small). Prices that carry no
# unit word presumably failed as.numeric() because of thousands separators,
# which would explain the old "NAs introduced by coercion" warning.
# NOTE(review): every recorded `##` output further down (NA count,
# coefficients) was knit with the old parsing; re-knit after this change.
UV2$car_price_in_rupees <- local({
  price_text <- as.character(UV2$car_price_in_rupees)
  # Multiplier that expresses each price in Lakh. A price with no unit word
  # is assumed to be in plain rupees (1 Lakh = 100,000 rupees) -- TODO
  # confirm against the raw CSV.
  lakh_per_unit <- ifelse(
    grepl("Crore", price_text, fixed = TRUE),
    100,
    ifelse(grepl("Lakh", price_text, fixed = TRUE), 1, 1e-5)
  )
  # Keep only digits and the decimal point; this drops the currency sign,
  # the unit word, thousands separators and spaces in one pass.
  price_digits <- gsub("[^0-9.]", "", price_text)
  as.numeric(price_digits) * lakh_per_unit
})

# Drop the "km" unit and the thousands separators in "kms_driven", then
# change the column from character to numeric
UV2$kms_driven <- as.numeric(gsub("km|,", "", as.character(UV2$kms_driven)))
# Count the NAs left in each regression variable after the numeric conversion
length(which(is.na(UV2[["car_price_in_rupees"]])))
## [1] 2
length(which(is.na(UV2[["kms_driven"]])))
## [1] 0
length(which(is.na(UV2[["year_of_manufacture"]])))
## [1] 0
# Impute missing prices with the median of the observed prices
UV2$car_price_in_rupees <- replace(
  UV2$car_price_in_rupees,
  is.na(UV2$car_price_in_rupees),
  median(UV2$car_price_in_rupees, na.rm = TRUE)
)
# Express the manufacture year as years since 2000 (e.g. 2016 -> 16) so the
# regressor is on a smaller scale for the matrix-based regression below
UV2[["year_of_manufacture"]] <- UV2[["year_of_manufacture"]] - 2000
# Build the design matrix in one step: a column of ones for the intercept,
# then kilometres driven, then the (rescaled) year of manufacture
k <- cbind(1, as.matrix(UV2$kms_driven), UV2$year_of_manufacture)
head(k, 5)
## [,1] [,2] [,3]
## [1,] 1 22402 16
## [2,] 1 10344 19
## [3,] 1 12999 21
## [4,] 1 45000 16
## [5,] 1 11193 19
# Put the response (car price) into a one-column matrix
p <- matrix(UV2$car_price_in_rupees, ncol = 1)
head(p, 5)
## [,1]
## [1,] 4.45
## [2,] 2.93
## [3,] 22.49
## [4,] 6.95
## [5,] 12.00
# Confirm that the design matrix and the response have the same row count
c(nrow(k), ncol(k))
## [1] 2105 3
c(nrow(p), ncol(p))
## [1] 2105 1
# OLS by matrix algebra: solve the normal equations (k'k) b = k'p for b.
# crossprod(k) is k'k and crossprod(k, p) is k'p; handing both to solve()
# solves the linear system directly instead of forming the explicit inverse
# of k'k and multiplying by it.
Coefficients <- solve(crossprod(k), crossprod(k, p))
Coefficients
## [,1]
## [1,] -1.037078e+01
## [2,] 7.064171e-06
## [3,] 1.235171e+00
# Coefficients: fit the same model with lm() to cross-check the matrix result
result <- lm(car_price_in_rupees ~ kms_driven + year_of_manufacture, data = UV2)
# The summary element is named "coefficients"; spell it out in full rather
# than relying on `$` partial matching of "coefficient".
summary(result)$coefficients
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.037078e+01 1.5672124145 -6.617343 4.626618e-11
## kms_driven 7.064171e-06 0.0000029989 2.355588 1.858483e-02
## year_of_manufacture 1.235171e+00 0.0890927650 13.863878 6.900323e-42
# One-regressor slope estimates, cov(y, x) / var(x), for each explanatory
# variable taken on its own. These ignore the other regressor, so they need
# not equal the multiple-regression coefficients above.
beta1 <- with(UV2, cov(car_price_in_rupees, kms_driven) / var(kms_driven))
beta2 <- with(
  UV2,
  cov(car_price_in_rupees, year_of_manufacture) / var(year_of_manufacture)
)
# Rounded to three decimals
round(beta1, 3)
## [1] 0
round(beta2, 3)
## [1] 1.2
Summary