library(car)
library("ggplot2")
# Install pacman if it is not already installed, then use it to install (when missing) and load the listed packages
if (!require("pacman")) install.packages("pacman")
## Loading required package: pacman
pacman::p_load("moments","extRemes","stringi", "ggplot2")
# Set the working directory to the assignment folder when it exists on this
# machine. On any other machine the script stays in the current directory, so
# it still runs as long as houseprices.csv sits next to it (an unconditional
# setwd() would abort the whole script there).
data_dir <- "D:\\Google Drive\\SA\\Assignment 2"
if (dir.exists(data_dir)) {
  setwd(data_dir)
}
# Read the house price data from the CSV file; the first row holds the column names.
housepricedf <- read.csv("houseprices.csv", header = TRUE)
Solution 1a : We formulate our null hypothesis that the mean house price mu is <= $150k (H0: mu <= 150000) and test it against the alternative H1: mu > 150000.
# Solution 1a: one-sample, right-tailed t-test of H0: mu <= 150000
# against H1: mu > 150000 on the Price column.

# hypothesised mean price under H0
mu <- 150000
# house prices; housepricedf has already been read from the CSV file above,
# so the file is not read a second time here
price <- housepricedf$Price
# sample mean rounded to the nearest dollar -- for reporting only
sampleMeanPrice <- round(mean(price))
# number of observations
samplesize <- length(price)
# sample standard deviation
sdprice <- sd(price)
# degrees of freedom of the one-sample t statistic
df <- samplesize - 1
# alpha value or significance level
alpha_value <- 0.05
# t statistic built from the exact (unrounded) sample mean, so that rounding
# does not perturb the test
t_stat_1a <- (mean(price) - mu) * sqrt(samplesize) / sdprice
# right-tail probability; lower.tail = FALSE asks pt() for the upper tail
# directly instead of computing 1 - pt(), which loses precision for tiny p-values
p_value_1a <- pt(t_stat_1a, df, lower.tail = FALSE)
## P-value is approximately 2.7e-11 (2.68243205425733e-11 was reported when the statistic used the rounded sample mean)
## Since the p-value is less than the alpha value we reject our null hypothesis, and hence the average price of a house may be greater than $150k
Solution 1b : We formulate our null hypothesis that the mean living area mu is <= 1800 sq ft (H0: mu <= 1800) and test it against the alternative H1: mu > 1800.
# Solution 1b: one-sample, right-tailed t-test of H0: mu <= 1800
# against H1: mu > 1800 on the Living.Area column.

# hypothesised mean living area under H0
mu <- 1800
# living area vector
livingarea <- housepricedf$Living.Area
# sample mean and sample standard deviation of the living area
samplemeanLivingArea <- mean(livingarea)
sdlivingarea <- sd(livingarea)
# t statistic; samplesize and df are the values computed earlier from the
# Price column (assumes Living.Area has the same number of observations)
t_stat_1b <- (samplemeanLivingArea - mu) * sqrt(samplesize) / sdlivingarea
# right-tail probability of the t statistic
p_value_1b <- 1 - pt(t_stat_1b, df)
## P-value is 0.356333895962788
## Since the p-value is greater than the alpha value we fail to reject our null hypothesis, and hence the mean living area of a house may not be greater than 1800 sq ft
2a) Create side-by-side box plots of the house prices of the two groups and comment them. (2marks)
# Split the houses into two groups by the Fireplace indicator.
# NOTE(review): no box-plot call is visible in this section; the plot that the
# commentary below refers to must be produced elsewhere -- confirm.
fireplace_flag <- as.numeric(housepricedf$Fireplace)
# houses where a fireplace is available (rows with a missing flag are dropped)
hp_fp <- housepricedf[which(fireplace_flag == 1), , drop = FALSE]
# houses where a fireplace is not available
hp_nfp <- housepricedf[which(fireplace_flag == 0), , drop = FALSE]
Looking at the box plots, the claim appears plausible: houses that have a fireplace seem to be more expensive.
2b) Formulate an appropriate hypothesis and test it in order to check the above claim. Assume that the population standard deviations of house prices in the two groups are equal. (1mark)
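Solution 2b : Our null hypothesis is that houses with a fireplace are not more expensive on average, i.e. mu(houses with fireplace) <= mu(houses without fireplace); the alternative is mu(houses with fireplace) > mu(houses without fireplace). Since it is given that the population standard deviations of the two groups are equal, we estimate that common standard deviation by pooling the two sample standard deviations.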
# Solution 2b: pooled (equal-variance) two-sample t-test of
# H0: mu_fp <= mu_nfp against H1: mu_fp > mu_nfp on Price.

# group means and group sizes
sample_mean_fp <- mean(hp_fp$Price)
Sample_mean_nfp <- mean(hp_nfp$Price)
n_fp <- nrow(hp_fp)
n_nfp <- nrow(hp_nfp)
# since the SDs are assumed equal for both population groups, pool the two
# sample variances, weighting each by its degrees of freedom
temp1 <- (n_fp - 1) * sd(hp_fp$Price)^2 + (n_nfp - 1) * sd(hp_nfp$Price)^2
temp2 <- n_fp + n_nfp - 2
pool_sd <- sqrt(temp1 / temp2)
# degrees of freedom in this case
df <- n_nfp + n_fp - 2
# standard error of the difference in means: one 1/n term per group
# (the earlier version used 1/nrow(hp_fp) twice and left out the no-fireplace group)
se_diff <- pool_sd * sqrt(1 / n_fp + 1 / n_nfp)
# the statistic is (mean without - mean with), so evidence for H1 lies in the
# left tail and pt() gives the p-value directly
p_value <- pt((Sample_mean_nfp - sample_mean_fp) / se_diff, df)
## p_value is far below the 0.05 significance level (4.33812960990951e-67 was reported when the standard error counted the fireplace group twice; re-run for the exact value), so we reject our null hypothesis and accept the claim that houses with a fireplace are more expensive compared to houses without one.
# Split the houses at an age of 30 and compare mean lot sizes with Welch's
# unequal-variance t-test.
# NOTE(review): oldhouses holds Age <= 30 and newhouses holds Age > 30; confirm
# that this matches the assignment's definition of "old" and "new".
oldhouses <- subset(housepricedf, as.numeric(Age) <= 30)
newhouses <- subset(housepricedf, as.numeric(Age) > 30)
# group sizes, means and standard deviations of Lot.Size
n_oh <- nrow(oldhouses)
n_nh <- nrow(newhouses)
mean_size_oldhouses <- mean(oldhouses$Lot.Size)
mean_size_newhouses <- mean(newhouses$Lot.Size)
sd_oh <- sd(oldhouses$Lot.Size)
sd_nh <- sd(newhouses$Lot.Size)
# As per the problem statement the variance (sigma) differs between the two samples.
# The claim under test is that old houses have the larger lot size, so
# H0: mu_nh - mu_oh >= 0 against H1: mu_nh - mu_oh < 0 (left-tailed).
# The built-in test below is two-sided; the one-sided p-value is computed by hand after it.
t.test(x = newhouses$Lot.Size, y = oldhouses$Lot.Size, var.equal = FALSE)
##
## Welch Two Sample t-test
##
## data: newhouses$Lot.Size and oldhouses$Lot.Size
## t = -0.59026, df = 610.28, p-value = 0.5552
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.13014310 0.06999065
## sample estimates:
## mean of x mean of y
## 0.5481788 0.5782550
# sample variances and squared standard errors of each group mean
var_oh <- sd_oh * sd_oh
var_nh <- sd_nh * sd_nh
se2_oh <- var_oh / n_oh
se2_nh <- var_nh / n_nh
# Welch-Satterthwaite degrees of freedom:
# (se2_oh + se2_nh)^2 / (se2_oh^2 / (n_oh - 1) + se2_nh^2 / (n_nh - 1))
temp1 <- se2_oh + se2_nh
temp1 <- temp1 * temp1
temp2 <- se2_oh * var_oh / n_oh / (n_oh - 1) + se2_nh * var_nh / n_nh / (n_nh - 1)
df_3 <- temp1 / temp2
# left-tail probability of the Welch t statistic
p_value <- pt((mean_size_newhouses - mean_size_oldhouses) / sqrt(se2_oh + se2_nh), df_3)
## p_value 0.277617477460418 is greater than the 0.05 significance level, so we fail to reject our null hypothesis; the data do not support the claim that old houses have a larger lot size compared to new houses
# num of old houses = 745, num of old houses with fireplace = 486, Proportion of old houses with fireplace
# = 0.65235
# num of new houses = 302, num of new houses with fireplace = 135, Proportion of new houses with fireplace
# = 0.44702
# The claim under test is that fireplaces are more common in new houses, so
# H0: pi(o) - pi(n) >= 0 against H1: pi(o) - pi(n) < 0 (left-tailed z-test).
# NOTE(review): oldhouses/newhouses are whatever the earlier Age split produced;
# confirm which age range each name is meant to cover.
n_oh <- nrow(oldhouses)
n_nh <- nrow(newhouses)
# share of houses with a fireplace in each group (missing flags are not counted)
p_oh_fp <- sum(as.numeric(oldhouses$Fireplace) == 1, na.rm = TRUE) / n_oh
p_nh_fp <- sum(as.numeric(newhouses$Fireplace) == 1, na.rm = TRUE) / n_nh
# pooled proportion across both groups, weighted by group size
weighted_p <- (p_oh_fp * n_oh + p_nh_fp * n_nh) / (n_nh + n_oh)
# z statistic for the difference of the two proportions under the pooled estimate
z <- (p_oh_fp - p_nh_fp) / sqrt(weighted_p * (1 - weighted_p) * ((1 / n_oh) + (1 / n_nh)))
# left-tail probability under the standard normal distribution
p_value <- pnorm(z)
## p_value 0.999999999552521 is greater than the 0.05 significance level, so we cannot reject our null hypothesis; the data do not support the claim that fireplaces in new houses have become more fashionable
# One-way ANOVA of Price across three size classes defined by bedroom count.
# Null hypothesis: mu_small = mu_med = mu_big.

# small: fewer than 3 bedrooms
smallhouses <- subset(housepricedf, Bedrooms < 3)
smallhouses$size <- "small"
#View(smallhouses)
# medium: more than 2 and at most 4 bedrooms
medhouses <- subset(housepricedf, Bedrooms <= 4 & Bedrooms > 2)
medhouses$size <- "medium"
#View(medhouses)
# big: more than 4 bedrooms
bighouses <- subset(housepricedf, Bedrooms > 4)
bighouses$size <- "big"
#View(bighouses)
# stack the three labelled groups back into a single data frame
temp <- rbind(smallhouses, medhouses)
housedataframe <- rbind(temp, bighouses)
#View(housedataframe)
# fit the one-way model of Price on the size label and print the ANOVA table
anova <- aov(housedataframe$Price ~ housedataframe$size)
summary(anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## housedataframe$size 2 4.840e+11 2.420e+11 58.71 <2e-16 ***
## Residuals 1044 4.303e+12 4.122e+09
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## From the ANOVA calculation the p-value is close to 0 and hence less than the 1% significance level, so we reject our null hypothesis and conclude that the mean prices of Small, Medium and Big houses are not all the same