Task: Using R, build a multiple regression model for data that interests you. Include in this model at least one quadratic term, one dichotomous term, and one dichotomous vs. quantitative interaction term. Interpret all coefficients. Conduct residual analysis. Was the linear model appropriate? Why or why not?
library(RCurl)
## Loading required package: bitops
library(ggplot2)
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:RCurl':
##
## complete
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Download the raw CSV text from GitHub, then parse it from memory.
medicalappointment <- getURL(
  "https://raw.githubusercontent.com/jgarcia71/Data-605-Assignments/master/Medical%20Appointment.csv"
)
medicalappointment_data <- read.csv(
  text = medicalappointment,
  stringsAsFactors = FALSE
)

# Encode Gender as a factor with "M" listed first, which makes "M" the
# reference level when the factor is used in a model formula.
medicalappointment_data[["Gender"]] <- factor(
  medicalappointment_data[["Gender"]],
  levels = c("M", "F")
)

# Preview the first six rows.
head(medicalappointment_data, n = 6L)
## PatientId AppointmentID Gender ScheduledDay AppointmentDay
## 1 9.60e+13 5595387 F 2016-04-18T12:36:04Z 2016-05-05T00:00:00Z
## 2 5.27e+13 5647604 F 2016-05-02T14:01:33Z 2016-05-09T00:00:00Z
## 3 5.27e+13 5681949 F 2016-05-10T15:27:23Z 2016-05-12T00:00:00Z
## 4 4.67e+12 5668760 M 2016-05-06T09:41:40Z 2016-05-06T00:00:00Z
## 5 7.24e+12 5427853 F 2016-03-03T15:17:12Z 2016-05-17T00:00:00Z
## 6 9.82e+12 5768785 F 2016-06-03T08:01:28Z 2016-06-03T00:00:00Z
## Age Neighbourhood Scholarship Hipertension Diabetes Alcoholism Handcap
## 1 37 QUITO 0 0 0 0 0
## 2 58 QUITO 0 0 0 0 0
## 3 58 QUITO 0 0 0 0 0
## 4 51 QUITO 0 0 0 0 0
## 5 64 QUITO 0 1 0 0 0
## 6 36 QUITO 0 0 0 0 0
## SMS_received Medicalappt.show
## 1 0 Yes
## 2 0 No
## 3 0 No
## 4 0 No
## 5 0 No
## 6 0 No
# Frequency count of each label in the Medicalappt.show column.
status_table <- table(
  medicalappointment_data[["Medicalappt.show"]]
)
status_table
##
## No Yes
## 88208 22319
# Bar chart of the Medicalappt.show label counts, one fill colour per label.
ggplot(
  data = medicalappointment_data,
  mapping = aes(x = Medicalappt.show, fill = Medicalappt.show)
) +
  geom_bar(colour = "black") +
  scale_fill_manual(values = c("#009E73", "#E69F00"))
# Keep only the columns that are used for modelling.
medicalappointment_revision <- medicalappointment_data %>%
  select(
    Gender, Age, Scholarship, Hipertension, Diabetes,
    Alcoholism, Handcap, SMS_received, Medicalappt.show
  )

# Recode the response label to numeric: "No" -> 0, "Yes" -> 1.
# A single vectorized lookup replaces the row-subset assignments and the
# element-by-element sapply(): it does not error when a label is absent or
# the column contains NA, and it leaves no names attribute on the column.
# Any label other than "No"/"Yes" (including NA) becomes NA.
medicalappointment_revision$Medicalappt.show <- unname(
  c(No = 0, Yes = 1)[as.character(medicalappointment_revision$Medicalappt.show)]
)

head(medicalappointment_revision)
## Gender Age Scholarship Hipertension Diabetes Alcoholism Handcap
## 1 F 37 0 0 0 0 0
## 2 F 58 0 0 0 0 0
## 3 F 58 0 0 0 0 0
## 4 M 51 0 0 0 0 0
## 5 F 64 0 1 0 0 0
## 6 F 36 0 0 0 0 0
## SMS_received Medicalappt.show
## 1 0 1
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
# Per-column summary of the modelling data.
# NOTE(review): the recorded output shows Age with a minimum of -1 and
# Handcap with a maximum of 4, so Age contains at least one invalid value and
# Handcap is not strictly 0/1 -- confirm whether these rows should be cleaned
# or recoded before modelling.
summary(object = medicalappointment_revision)
## Gender Age Scholarship Hipertension
## M:38687 Min. : -1.00 Min. :0.00000 Min. :0.0000
## F:71840 1st Qu.: 18.00 1st Qu.:0.00000 1st Qu.:0.0000
## Median : 37.00 Median :0.00000 Median :0.0000
## Mean : 37.09 Mean :0.09827 Mean :0.1972
## 3rd Qu.: 55.00 3rd Qu.:0.00000 3rd Qu.:0.0000
## Max. :115.00 Max. :1.00000 Max. :1.0000
## Diabetes Alcoholism Handcap SMS_received
## Min. :0.00000 Min. :0.0000 Min. :0.00000 Min. :0.000
## 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.000
## Median :0.00000 Median :0.0000 Median :0.00000 Median :0.000
## Mean :0.07186 Mean :0.0304 Mean :0.02225 Mean :0.321
## 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:1.000
## Max. :1.00000 Max. :1.0000 Max. :4.00000 Max. :1.000
## Medicalappt.show
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.2019
## 3rd Qu.:0.0000
## Max. :1.0000
# Fit an ordinary least-squares model of the 0/1 response on all retained
# predictors (a linear probability model).
#
# The data frame is supplied through `data =` rather than attach(): attach()
# places a copy of the columns on the search path, where they can be masked
# by (or mask) other objects and do not follow later changes to the data
# frame. With `data =` the printed Call additionally shows the data argument;
# the fitted coefficients are unaffected.
#
# NOTE(review): the task statement at the top of this file asks for a
# quadratic term and a dichotomous-by-quantitative interaction; this formula
# contains neither. Adding e.g. `I(Age^2)` and `Gender:Age` would satisfy it,
# but would change every estimate reported below, so it is left for the
# author to decide.
medicalappointment.lm <- lm(
  Medicalappt.show ~ Gender + Age + Scholarship + Hipertension + Diabetes +
    Alcoholism + Handcap + SMS_received,
  data = medicalappointment_revision
)
medicalappointment.lm
##
## Call:
## lm(formula = Medicalappt.show ~ Gender + Age + Scholarship +
## Hipertension + Diabetes + Alcoholism + Handcap + SMS_received)
##
## Coefficients:
## (Intercept) GenderF Age Scholarship Hipertension
## 0.200122 0.002627 -0.001021 0.030899 -0.009529
## Diabetes Alcoholism Handcap SMS_received
## 0.012640 0.020963 0.005224 0.109500
# Coefficient table, residual quantiles, R-squared and overall F-test
# for the fitted model.
summary(object = medicalappointment.lm)
##
## Call:
## lm(formula = Medicalappt.show ~ Gender + Age + Scholarship +
## Hipertension + Diabetes + Alcoholism + Handcap + SMS_received)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.3432 -0.2132 -0.1700 -0.1272 0.9094
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.001e-01 2.848e-03 70.267 < 2e-16 ***
## GenderF 2.627e-03 2.563e-03 1.025 0.30538
## Age -1.021e-03 6.102e-05 -16.733 < 2e-16 ***
## Scholarship 3.090e-02 4.073e-03 7.586 3.33e-14 ***
## Hipertension -9.529e-03 3.717e-03 -2.564 0.01036 *
## Diabetes 1.264e-02 5.161e-03 2.449 0.01432 *
## Alcoholism 2.096e-02 7.066e-03 2.967 0.00301 **
## Handcap 5.224e-03 7.437e-03 0.702 0.48241
## SMS_received 1.095e-01 2.565e-03 42.698 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3973 on 110518 degrees of freedom
## Multiple R-squared: 0.02053, Adjusted R-squared: 0.02045
## F-statistic: 289.5 on 8 and 110518 DF, p-value: < 2.2e-16
# Residual diagnostics for the fitted model.

# Residuals against fitted values. The response is coded 0/1, so each
# residual is either (0 - fitted) or (1 - fitted).
plot(
  x = fitted(medicalappointment.lm),
  y = resid(medicalappointment.lm)
)

# Distribution of the residuals.
hist(
  x = resid(medicalappointment.lm),
  col = "lightblue",
  border = "pink"
)

# Normal Q-Q plot of the residuals with its reference line.
qqnorm(y = resid(medicalappointment.lm))
qqline(y = resid(medicalappointment.lm))