Part I

# Data selection from Kaggle: import the used-car listings as a data frame.
# The path is absolute and specific to the author's machine.
UV <- read.csv(
  file = "/Users/pin.lyu/Desktop/BC_Class_Folder/Econometrics/DIS_&_ASSIGNMENT/DIS_3/2ndhandcar.csv",
  header = TRUE
)

Data description:

This data was scraped from https://www.carwale.com/, a popular platform in India for buying and selling cars. CarWale is a website and app that provides information about new and used cars in India. Users can research and compare different models, read reviews, and find dealerships in their area. This is a cross-sectional data set.

This data set contains information on used cars for sale, including model name, year of manufacturing, Km driven, price, and fuel type.

  • car_name = Brand of the Auto-maker
  • car_price_in_rupees = Price of the vehicle when purchased as used
  • kms_driven = Mileage on the car when purchased as used
  • fuel_type = Type of fuel used (Diesel/Gasoline)
  • city = Where the vehicle was purchased
  • year_of_manufacture = When the vehicle was manufactured

Part II

\[ \text{Price} = \beta_0 + \beta_1 \text{Mileage} + \beta_2 \text{Year} + \epsilon \]

Part III

# Check empty values in all variables
colSums(is.na(UV))
##            car_name car_price_in_rupees          kms_driven           fuel_type 
##                   0                   0                   0                   0 
##                city year_of_manufacture 
##                   0                   0
# Work on a copy so the raw import (UV) stays untouched
UV2 <- UV

# Convert "car_price_in_rupees" from text to a numeric price in Lakh.
# The original cleaning deleted the words "Lakh" and "Crore" without
# rescaling, which left Crore-priced cars 100 times too cheap relative to
# Lakh-priced ones (1 Crore = 100 Lakh). Here each price is multiplied by a
# factor chosen from its own unit word.
price_text <- as.character(UV2$car_price_in_rupees)
price_to_lakh <- ifelse(
  grepl("Crore", price_text, fixed = TRUE), 100,
  # NOTE(review): a price with no unit word is assumed to be plain rupees
  # (1 rupee = 1e-05 Lakh), going by the column name -- confirm against data.
  ifelse(grepl("Lakh", price_text, fixed = TRUE), 1, 1e-05)
)
# Thousands separators are removed as well; presumably the 2 NAs in the
# originally recorded run came from comma-grouped prices -- TODO confirm.
price_digits <- gsub("₹|,|Lakh|Crore", "", price_text)
UV2$car_price_in_rupees <- as.numeric(price_digits) * price_to_lakh

# Drop the unit and thousands separators in "kms_driven", then make it numeric
UV2$kms_driven <- as.numeric(gsub("km|,", "", as.character(UV2$kms_driven)))

# Finding missing values
# (The recorded run of the original cleaning reported 2 missing prices;
# that count predates the unit/comma handling above -- re-knit to refresh.)
sum(is.na(UV2$car_price_in_rupees))
sum(is.na(UV2$kms_driven))
## [1] 0
sum(is.na(UV2$year_of_manufacture))
## [1] 0
# Fill any remaining missing prices with the median price
UV2$car_price_in_rupees[is.na(UV2$car_price_in_rupees)] <- median(UV2$car_price_in_rupees, na.rm=TRUE)
# NOTE: recorded outputs further down this document that depend on price were
# produced before the unit conversion; re-knit the document to refresh them.

Part IV

# Express the manufacture year as years since 2000 so the regressor is on a
# smaller scale for the matrix computation that follows.
# Note: this overwrites the column, so re-running this line shifts it again.
UV2[["year_of_manufacture"]] <- UV2[["year_of_manufacture"]] - 2000

# Design matrix: a column of ones (intercept), kms driven, shifted year
k <- cbind(1, as.matrix(UV2[["kms_driven"]]), UV2[["year_of_manufacture"]])

head(k, 5)
##      [,1]  [,2] [,3]
## [1,]    1 22402   16
## [2,]    1 10344   19
## [3,]    1 12999   21
## [4,]    1 45000   16
## [5,]    1 11193   19
# Response (Y) as a one-column matrix holding the cleaned price
p <- matrix(UV2[["car_price_in_rupees"]], ncol = 1)

head(p, 5)
##       [,1]
## [1,]  4.45
## [2,]  2.93
## [3,] 22.49
## [4,]  6.95
## [5,] 12.00
# Dimension check: k and p must have the same number of rows
dim(k)
## [1] 2105    3
dim(p)
## [1] 2105    1
# OLS estimate from the normal equations (k'k) b = k'p.
# The system is solved directly instead of forming the explicit inverse
# solve(t(k) %*% k): same estimate, but numerically more stable and cheaper.
# crossprod(k) is t(k) %*% k and crossprod(k, p) is t(k) %*% p.
Coefficients <- solve(crossprod(k), crossprod(k, p))
Coefficients
##               [,1]
## [1,] -1.037078e+01
## [2,]  7.064171e-06
## [3,]  1.235171e+00

Part V

# Coefficients
# Fit the Part II model (price on kms driven and year) with lm()
result <- lm(car_price_in_rupees ~ kms_driven + year_of_manufacture, data = UV2)
# coef() on the summary returns the full coefficient table. The previous
# `summary(result)$coefficient` only worked through partial matching of the
# `coefficients` component, which is fragile.
coef(summary(result))
##                          Estimate   Std. Error   t value     Pr(>|t|)
## (Intercept)         -1.037078e+01 1.5672124145 -6.617343 4.626618e-11
## kms_driven           7.064171e-06 0.0000029989  2.355588 1.858483e-02
## year_of_manufacture  1.235171e+00 0.0890927650 13.863878 6.900323e-42
# Slopes from the covariance formula cov(y, x) / var(x), one regressor at a
# time. These are single-regressor slopes, so they need not equal the lm()
# estimates above, which hold the other regressor fixed.
beta1 <- with(UV2, cov(car_price_in_rupees, kms_driven) / var(kms_driven))
beta2 <- with(
  UV2,
  cov(car_price_in_rupees, year_of_manufacture) / var(year_of_manufacture)
)

round(beta1, 3)
## [1] 0
round(beta2, 3)
## [1] 1.2

Part VI