library(dplyr)
library(magrittr)
library(ggplot2)
#----------------------------- Question 1 ------------------------------
#-- Part A --
#read in the data file
airbnb_data <- read.csv(choose.files(), stringsAsFactors = TRUE)
#fitting the model
airbnb_model = lm(price ~ room_type + reviews + overall_satisfaction
+ accommodates + bedrooms, data = airbnb_data)
#summary of the model
summary(airbnb_model)
##
## Call:
## lm(formula = price ~ room_type + reviews + overall_satisfaction +
## accommodates + bedrooms, data = airbnb_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -367.8 -49.2 3.2 38.6 4032.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -23.36172 21.88618 -1.067 0.28609
## room_typePrivate room -0.93115 13.21827 -0.070 0.94386
## room_typeShared room -76.66780 59.90939 -1.280 0.20099
## reviews 0.01090 0.09982 0.109 0.91310
## overall_satisfaction -10.48160 3.47320 -3.018 0.00262 **
## accommodates 23.00721 5.23952 4.391 1.27e-05 ***
## bedrooms 85.64533 11.45983 7.474 1.95e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 167.1 on 847 degrees of freedom
## Multiple R-squared: 0.3228, Adjusted R-squared: 0.318
## F-statistic: 67.3 on 6 and 847 DF, p-value: < 2.2e-16
The statistically significant variables are:
bedrooms
overall_satisfaction
accommodates
#create the test data that we'll be predicting
price_test = data.frame(bedrooms = c(1), accommodates = c(2),
reviews = c(70), overall_satisfaction = c(4),
room_type = c('Private room'))
summary(price_test)
## bedrooms accommodates reviews overall_satisfaction room_type
## Min. :1 Min. :2 Min. :70 Min. :4 Length:1
## 1st Qu.:1 1st Qu.:2 1st Qu.:70 1st Qu.:4 Class :character
## Median :1 Median :2 Median :70 Median :4 Mode :character
## Mean :1 Mean :2 Mean :70 Mean :4
## 3rd Qu.:1 3rd Qu.:2 3rd Qu.:70 3rd Qu.:4
## Max. :1 Max. :2 Max. :70 Max. :4
#predict the price for the listing
predict(airbnb_model, price_test, interval = 'prediction')
## fit lwr upr
## 1 66.20316 -262.3605 394.7669
The predicted price is $66.20
#lets check the plot to see outliers. We're only showing the residual to show > 1
plot(airbnb_model)
#we can see 94 and 95 are outliers so lets remove them.
airbnb_data_2 <- airbnb_data[-c(94,95),]
#reruning the model for the new filtered data
airbnb_filtered_model <- lm(price ~ room_type + reviews + overall_satisfaction
+ accommodates + bedrooms, data = airbnb_data_2)
summary(airbnb_filtered_model)
##
## Call:
## lm(formula = price ~ room_type + reviews + overall_satisfaction +
## accommodates + bedrooms, data = airbnb_data_2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -190.95 -32.43 -7.09 20.35 876.26
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 75.01310 9.09152 8.251 6.01e-16 ***
## room_typePrivate room -32.28201 5.38034 -6.000 2.92e-09 ***
## room_typeShared room -91.69951 24.28958 -3.775 0.000171 ***
## reviews -0.05915 0.04047 -1.462 0.144202
## overall_satisfaction -6.78957 1.41118 -4.811 1.78e-06 ***
## accommodates 11.90698 2.14267 5.557 3.68e-08 ***
## bedrooms 35.93177 4.87968 7.364 4.25e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 67.73 on 845 degrees of freedom
## Multiple R-squared: 0.4249, Adjusted R-squared: 0.4208
## F-statistic: 104 on 6 and 845 DF, p-value: < 2.2e-16
#lets see the new plot
plot(airbnb_filtered_model)
#read in the data file
dirmark_data <- read.csv(choose.files(), stringsAsFactors = TRUE)
summary(dirmark_data)
## Age Gender OwnHome Married Location
## Middle:508 Female:506 Own :516 Married:502 Close:710
## Old :205 Male :494 Rent:484 Single :498 Far :290
## Young :287
##
##
##
## Salary Children History Catalogs AmountSpent
## Min. : 10100 Min. :0.000 High :255 Min. : 6.00 Min. : 3.80
## 1st Qu.: 29975 1st Qu.:0.000 Low :230 1st Qu.: 6.00 1st Qu.: 48.83
## Median : 53700 Median :1.000 Medium:212 Median :12.00 Median : 96.20
## Mean : 56104 Mean :0.934 None :303 Mean :14.68 Mean :121.68
## 3rd Qu.: 77025 3rd Qu.:2.000 3rd Qu.:18.00 3rd Qu.:168.85
## Max. :168800 Max. :3.000 Max. :24.00 Max. :621.70
dirmark_data<- dirmark_data %>% mutate(Low = ifelse(History == 'Low', 1, 0)) %>%
mutate(Medium = ifelse(History== 'Medium', 1, 0)) %>%
mutate(High = ifelse(History=='High', 1, 0)) %>%
mutate(LowSalary = Salary*Low) %>% mutate(MediumSalary = Salary*Medium) %>%
mutate(HighSalary = Salary*High)
str(dirmark_data)
## 'data.frame': 1000 obs. of 16 variables:
## $ Age : Factor w/ 3 levels "Middle","Old",..: 2 1 3 1 1 3 1 1 1 2 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 1 2 1 2 1 2 1 2 1 2 ...
## $ OwnHome : Factor w/ 2 levels "Own","Rent": 1 2 2 1 1 1 2 1 1 1 ...
## $ Married : Factor w/ 2 levels "Married","Single": 2 2 2 1 2 1 2 2 1 1 ...
## $ Location : Factor w/ 2 levels "Close","Far": 2 1 1 1 1 1 1 1 1 2 ...
## $ Salary : int 47500 63600 13500 85600 68400 30400 48100 68400 51900 80700 ...
## $ Children : int 0 0 0 1 0 0 0 0 3 0 ...
## $ History : Factor w/ 4 levels "High","Low","Medium",..: 1 1 2 1 1 2 3 1 2 4 ...
## $ Catalogs : int 6 6 18 18 12 6 12 18 6 18 ...
## $ AmountSpent : num 75.5 131.8 29.6 243.6 130.4 ...
## $ Low : num 0 0 1 0 0 1 0 0 1 0 ...
## $ Medium : num 0 0 0 0 0 0 1 0 0 0 ...
## $ High : num 1 1 0 1 1 0 0 1 0 0 ...
## $ LowSalary : num 0 0 13500 0 0 30400 0 0 51900 0 ...
## $ MediumSalary: num 0 0 0 0 0 0 48100 0 0 0 ...
## $ HighSalary : num 47500 63600 0 85600 68400 0 0 68400 0 0 ...
#create the model
dirmark_model <- lm(AmountSpent ~ Low + Medium + High + LowSalary +
MediumSalary + HighSalary, data = dirmark_data)
summary(dirmark_model)
##
## Call:
## lm(formula = AmountSpent ~ Low + Medium + High + LowSalary +
## MediumSalary + HighSalary, data = dirmark_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -214.33 -35.19 -7.49 25.17 374.41
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.240e+02 3.912e+00 31.694 < 2e-16 ***
## Low -9.658e+01 8.548e+00 -11.299 < 2e-16 ***
## Medium -4.273e+01 1.423e+01 -3.004 0.00274 **
## High -4.935e+01 1.732e+01 -2.850 0.00447 **
## LowSalary 2.573e-04 1.901e-04 1.354 0.17620
## MediumSalary 2.488e-04 2.321e-04 1.072 0.28397
## HighSalary 1.723e-03 1.954e-04 8.820 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 68.1 on 993 degrees of freedom
## Multiple R-squared: 0.501, Adjusted R-squared: 0.498
## F-statistic: 166.1 on 6 and 993 DF, p-value: < 2.2e-16
#check salary summary to see where $10,000 might fall.
summary(dirmark_data$Salary)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 10100 29975 53700 56104 77025 168800
#thanks to salary we can see the min is 10100, so 10000 falls in low.
#We're going to create test data based on low, and predict the model.
test_lowSal <- data.frame(Low = 1, Medium = 0, High = 0, LowSalary = 10000,
MediumSalary = 0, HighSalary = 0)
predict(dirmark_model, test_lowSal, interval = 'prediction')
## fit lwr upr
## 1 29.98157 -104.1962 164.1594
#Just in case, We'll do the same thing for medium, and high
test_mediumSal <- data.frame(Low = 0, Medium = 1, High = 0, LowSalary = 0,
MediumSalary = 10000, HighSalary = 0)
predict(dirmark_model, test_mediumSal, interval = 'prediction')
## fit lwr upr
## 1 83.74909 -51.78138 219.2796
test_highSal <- data.frame(Low = 0, Medium = 0, High = 1, LowSalary = 0,
MediumSalary = 0, HighSalary = 10000)
predict(dirmark_model, test_highSal, interval = 'prediction')
## fit lwr upr
## 1 91.86874 -44.96113 228.6986
The amount spent through each category is as follows:
#read in the data file
airbnb_data2 <- read.csv(choose.files(), stringsAsFactors = TRUE)
summary(airbnb_data2)
## room_id survey_id host_id room_type
## Min. : 67870 Min. :1498 Min. : 62667 Entire home/apt:512
## 1st Qu.: 6413734 1st Qu.:1498 1st Qu.: 6453926 Private room :334
## Median :13329838 Median :1498 Median : 22920130 Shared room : 8
## Mean :11672573 Mean :1498 Mean : 37877449
## 3rd Qu.:16856088 3rd Qu.:1498 3rd Qu.: 58634762
## Max. :19912932 Max. :1498 Max. :141036151
## city reviews overall_satisfaction accommodates
## Asheville:854 Min. : 0.00 Min. :0.00 Min. : 1.000
## 1st Qu.: 8.00 1st Qu.:4.50 1st Qu.: 2.000
## Median : 28.00 Median :5.00 Median : 3.000
## Mean : 49.11 Mean :4.18 Mean : 3.412
## 3rd Qu.: 65.00 3rd Qu.:5.00 3rd Qu.: 4.000
## Max. :602.00 Max. :5.00 Max. :17.000
## bedrooms price
## Min. : 0.000 Min. : 20.0
## 1st Qu.: 1.000 1st Qu.: 70.0
## Median : 1.000 Median : 95.0
## Mean : 1.352 Mean : 126.6
## 3rd Qu.: 2.000 3rd Qu.: 139.0
## Max. :10.000 Max. :5000.0
#find the min to make sure the values are all positive.
min(airbnb_data2$overall_satisfaction)
## [1] 0
#add 1 due to the min being zero
airbnb_new <- airbnb_data2 %>% mutate(ov_new = overall_satisfaction +1)
airbnb_ln <- airbnb_new %>% mutate(log_price = log(price)) %>%
mutate(log_ov_sat = log(ov_new))
head(airbnb_ln)
#linear-linear
airbnb_linlin = lm(price ~ overall_satisfaction, data = airbnb_ln)
summary(airbnb_linlin)
##
## Call:
## lm(formula = price ~ overall_satisfaction, data = airbnb_ln)
##
## Residuals:
## Min 1Q Median 3Q Max
## -167.0 -51.3 -24.2 16.8 4805.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 194.967 17.698 11.016 < 2e-16 ***
## overall_satisfaction -16.353 3.903 -4.189 3.09e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 200.4 on 852 degrees of freedom
## Multiple R-squared: 0.02018, Adjusted R-squared: 0.01903
## F-statistic: 17.55 on 1 and 852 DF, p-value: 3.088e-05
#linear-log
airbnb_linlog = lm(price ~ log_ov_sat, data = airbnb_ln)
summary(airbnb_linlog)
##
## Call:
## lm(formula = price ~ log_ov_sat, data = airbnb_ln)
##
## Residuals:
## Min 1Q Median 3Q Max
## -168.5 -50.7 -24.7 16.3 4803.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 196.46 17.76 11.062 < 2e-16 ***
## log_ov_sat -46.20 10.84 -4.263 2.24e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 200.4 on 852 degrees of freedom
## Multiple R-squared: 0.02089, Adjusted R-squared: 0.01974
## F-statistic: 18.18 on 1 and 852 DF, p-value: 2.239e-05
#log-linear
airbnb_loglin = lm(log_price ~ overall_satisfaction, data = airbnb_ln)
summary(airbnb_loglin)
##
## Call:
## lm(formula = log_price ~ overall_satisfaction, data = airbnb_ln)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.6234 -0.3525 -0.0432 0.3302 3.7220
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.79515 0.05083 94.339 < 2e-16 ***
## overall_satisfaction -0.04401 0.01121 -3.926 9.33e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5757 on 852 degrees of freedom
## Multiple R-squared: 0.01777, Adjusted R-squared: 0.01662
## F-statistic: 15.41 on 1 and 852 DF, p-value: 9.331e-05
#log-log regression
airbnb_loglog = lm(log_price ~ log_ov_sat, data = airbnb_ln)
summary(airbnb_loglog)
##
## Call:
## lm(formula = log_price ~ log_ov_sat, data = airbnb_ln)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.6030 -0.3551 -0.0327 0.3298 3.7132
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.80396 0.05098 94.228 < 2e-16 ***
## log_ov_sat -0.12750 0.03111 -4.099 4.55e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5752 on 852 degrees of freedom
## Multiple R-squared: 0.01934, Adjusted R-squared: 0.01819
## F-statistic: 16.8 on 1 and 852 DF, p-value: 4.547e-05
#finding out the best R^2
summary(airbnb_linlin)$r.squared
## [1] 0.02018428
summary(airbnb_linlog)$r.squared
## [1] 0.02088739
summary(airbnb_loglin)$r.squared
## [1] 0.01777027
summary(airbnb_loglog)$r.squared
## [1] 0.01933861
Linear_log seems the be the best R^2 model, as its the closest to 1.
titanic_data <- read.csv(choose.files(), stringsAsFactors = TRUE)
#convert the survived variable to be a factor with two levels
titanic_data$Survived <- as.factor(titanic_data$Survived)
str(titanic_data)
## 'data.frame': 756 obs. of 5 variables:
## $ Name : Factor w/ 753 levels "Abbing, Mr Anthony",..: 22 25 26 27 24 31 44 45 49 53 ...
## $ PClass : Factor w/ 3 levels "1st","2nd","3rd": 1 1 1 1 1 1 1 1 1 1 ...
## $ Age : num 29 2 30 25 0.92 47 63 39 58 71 ...
## $ Sex : Factor w/ 2 levels "female","male": 1 1 2 1 2 2 1 2 1 2 ...
## $ Survived: Factor w/ 2 levels "0","1": 2 1 1 1 2 2 2 1 2 1 ...
#performing logistic regression
titanic_glm <- glm(Survived ~ Sex , data = titanic_data, family = "binomial")
summary(titanic_glm)
##
## Call:
## glm(formula = Survived ~ Sex, family = "binomial", data = titanic_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6735 -0.6776 -0.6776 0.7524 1.7800
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.1172 0.1367 8.171 3.05e-16 ***
## Sexmale -2.4718 0.1783 -13.861 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1025.57 on 755 degrees of freedom
## Residual deviance: 796.64 on 754 degrees of freedom
## AIC: 800.64
##
## Number of Fisher Scoring iterations: 4
The intercept coefficient b0 is 1.172. This represents Ln(odds of sex female).
fem_surv <- data.frame(Sex="female")
predict(titanic_glm, fem_surv, type="response")
## 1
## 0.7534722
The probability of survival for females is 75.35%
male_surv <- data.frame(Sex="male")
predict(titanic_glm, male_surv, type="response")
## 1
## 0.2051282
The probability of survival for males is 20.51%