# Load the training data.
# The project directory below is specific to one machine. Switch to it only
# when it exists, so the script can also be run from the project root on any
# other machine without editing this path.
project_dir <- "C:/Users/kaza_/OneDrive/IIMK Project"
if (dir.exists(project_dir)) {
  setwd(project_dir)
}
# Fail early with a readable message instead of a bare connection error.
train_path <- file.path("Data", "train.csv")
if (!file.exists(train_path)) {
  stop("Cannot find ", train_path, " relative to ", getwd(), call. = FALSE)
}
trainData <- read.csv(train_path, header = TRUE, sep = ",")
str(trainData)
## 'data.frame': 6399 obs. of 30 variables:
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ car_name : chr "maruti swift" "maruti alto 800" "hyundai grand i10" "maruti swift" ...
## $ yr_mfr : int 2015 2016 2017 2013 2015 2018 2012 2012 2014 2018 ...
## $ fuel_type : chr "petrol" "petrol" "petrol" "diesel" ...
## $ kms_run : int 8063 23104 23402 39124 22116 23534 41213 38328 56402 32703 ...
## $ sale_price : int 386399 265499 477699 307999 361499 335299 281999 321499 456199 281299 ...
## $ city : chr "noida" "noida" "noida" "noida" ...
## $ times_viewed : int 18715 2676 609 6511 3225 1055 909 2760 2475 2497 ...
## $ body_type : chr "hatchback" "hatchback" "hatchback" "hatchback" ...
## $ transmission : chr "manual" "manual" "manual" "manual" ...
## $ variant : chr "lxi opt" "lxi" "sports 1.2 vtvt" "vdi" ...
## $ assured_buy : chr "True" "True" "True" "True" ...
## $ registered_city : chr "delhi" "noida" "agra" "delhi" ...
## $ registered_state : chr "delhi" "uttar pradesh" "uttar pradesh" "delhi" ...
## $ is_hot : chr "True" "True" "True" "True" ...
## $ rto : chr "dl6c" "up16" "up80" "dl1c" ...
## $ source : chr "inperson_sale" "inperson_sale" "inperson_sale" "inperson_sale" ...
## $ make : chr "maruti" "maruti" "hyundai" "maruti" ...
## $ model : chr "swift" "alto 800" "grand i10" "swift" ...
## $ car_availability : chr "in_stock" "in_stock" "in_stock" "in_stock" ...
## $ total_owners : int 2 1 1 1 1 1 1 3 1 1 ...
## $ broker_quote : int 397677 272935 469605 294262 360716 343212 201200 319200 452023 264597 ...
## $ original_price : num 404177 354313 NA 374326 367216 ...
## $ car_rating : chr "great" "great" "great" "great" ...
## $ ad_created_on : chr "2021-04-04T07:09:18.583" "2021-03-22T14:07:32.833" "2021-03-20T05:36:31.311" "2021-01-21T12:59:19.299" ...
## $ fitness_certificate: chr "True" "True" "True" "True" ...
## $ emi_starts_from : int 8975 6167 11096 7154 8397 7788 6550 7468 10596 6534 ...
## $ booking_down_pymnt : int 57960 39825 71655 46200 54225 50295 42300 48225 68430 42195 ...
## $ reserved : chr "False" "False" "False" "False" ...
## $ warranty_avail : chr "False" "False" "False" "False" ...
# Recode fuel type into a two-level factor: "1" for petrol, "0" for every
# other listed fuel. Repeating the label "0" merges those four levels into
# one; any fuel_type value not listed in `levels` becomes NA.
trainData[["FUEL_TYPE_F"]] <- factor(
  trainData[["fuel_type"]],
  levels = c("petrol", "diesel", "electric", "petrol & cng", "petrol & lpg"),
  labels = c("1", rep("0", 4L))
)
# Share of each recoded level, expressed in percent.
table_ft <- table(trainData[["FUEL_TYPE_F"]])
100 * prop.table(table_ft)
##
## 1 0
## 62.52539 37.47461
# One-sample z statistic for a proportion: is the share of petrol cars
# different from the null value of 50%?
# The sample proportion is computed from the data rather than typed in as a
# rounded constant, so the statistic stays correct if the data change.
p_null <- 0.50
# TRUE for petrol, FALSE for other recoded fuels, NA where FUEL_TYPE_F is NA.
is_petrol <- trainData$FUEL_TYPE_F == "1"
# Use only the rows that were actually classified as the sample size.
n_obs <- sum(!is.na(is_petrol))
p_hat <- mean(is_petrol, na.rm = TRUE)
# Standard error under the null hypothesis.
se <- sqrt(p_null * (1 - p_null) / n_obs)
z_stat1 <- (p_hat - p_null) / se
z_stat1
## [1] 20.03907
# Hypothesis 2: one-way ANOVA of times viewed (TV) across body types (BT).
HYP2 <- data.frame(
  TV = trainData[["times_viewed"]],
  BT = trainData[["body_type"]]
)
HYP2[["BT_F"]] <- factor(HYP2[["BT"]])
anova <- aov(formula = TV ~ BT_F, data = HYP2)
# Per-group means and group sizes.
# NOTE(review): the table below has six value columns but only five visible
# labels, so one BT_F level presumably has a blank name (empty body_type);
# confirm whether those rows should be part of the test.
model.tables(anova, type = "means")
## Tables of means
## Grand mean
##
## 1534.861
##
## BT_F
## hatchback luxury sedan luxury suv sedan suv
## 1803 1450 2607 2332 1458 1658
## rep 90 3798 135 169 1261 946
# F test for any difference in mean views between body types.
summary(anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## BT_F 5 3.184e+08 63673067 16.68 <2e-16 ***
## Residuals 6393 2.440e+10 3816224
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Hypothesis 3: one-sided two-sample t-test of sale price (SP), hatchback
# versus suv. With alternative = "less" the test asks whether the mean of
# the first factor level (hatchback) is below that of the second (suv).
HYP3 <- data.frame(
  SP = trainData[["sale_price"]],
  BT = trainData[["body_type"]]
)
# The factor is built before filtering, so BT_F keeps every original level.
HYP3[["BT_F"]] <- factor(HYP3[["BT"]])
HYP3 <- HYP3[HYP3[["BT"]] %in% c("hatchback", "suv"), ]
# Pooled-variance (Student) form of the test.
HYP_TEST <- t.test(
  SP ~ BT_F,
  data = HYP3,
  var.equal = TRUE,
  alternative = "less"
)
HYP_TEST
##
## Two Sample t-test
##
## data: SP by BT_F
## t = -53.752, df = 4742, p-value < 2.2e-16
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
## -Inf -337400.8
## sample estimates:
## mean in group hatchback mean in group suv
## 347860.3 695913.8