library('data.table')
library("ggpubr")
library('dplyr')
library(tidyr)
library(kableExtra)
library(ggthemes)
library(gridExtra)
library(car) 
library(Hmisc)
library(psychometric)
library(granova)

# Put a very high penalty on scientific notation so numbers (e.g. tiny
# p-values) are printed in fixed notation throughout the report.
options(scipen=999)

1. Student Details


Rashbir Singh Kohli (s3810585)


2. Introduction



3. Problem Statement



Part A



4. Data (Problem Statement 1)


# Read the raw length-of-stay extract; the first row supplies the column names.
ALOSDf <- fread('Assignment 4A.csv', header = TRUE)

# Normalise the column names into plain alphabetic identifiers.
names(ALOSDf) <- gsub(" ", "", names(ALOSDf)) # Remove white space
names(ALOSDf) <- gsub("LHN", "", names(ALOSDf)) # Remove the "LHN" abbreviation
names(ALOSDf) <- gsub("days", "InDays", names(ALOSDf)) # Substitute 'days' with 'InDays'
# Keep letters only. The class must be spelled A-Za-z: the range "A-z" also
# covers the ASCII characters between 'Z' and 'a' ([ \ ] ^ _ `), which would
# therefore survive the clean-up.
names(ALOSDf) <- gsub("[^A-Za-z]", "", names(ALOSDf))

# Keep only the South Western Sydney rows and the two columns of interest.
ALOSDfInterest <- ALOSDf[ALOSDf$LocalHospitalNetwork == 'South Western Sydney',
                         c("LocalHospitalNetwork", "AveragelengthofstayInDays")]

# Drop rows whose length of stay is a placeholder code ('-' or 'NP') or is
# missing. Missing values are excluded explicitly rather than relying on how
# NA comparisons are treated during subsetting.
ALOSDfInterest <- ALOSDfInterest[
  !is.na(AveragelengthofstayInDays) &
    !(AveragelengthofstayInDays %in% c('-', 'NP')),
]

# Convert average length of stay (in days) to numeric now that only numeric
# text remains.
ALOSDfInterest$AveragelengthofstayInDays <-
  as.numeric(ALOSDfInterest$AveragelengthofstayInDays)
head(ALOSDfInterest)
##    LocalHospitalNetwork AveragelengthofstayInDays
## 1: South Western Sydney                       2.6
## 2: South Western Sydney                       2.6
## 3: South Western Sydney                       2.5
## 4: South Western Sydney                       2.5
## 5: South Western Sydney                       2.5
## 6: South Western Sydney                       2.4

5. Descriptive Statistics and Visualisation (Problem Statement 1)


# Table 1: one-row descriptive summary of average length of stay, rendered as
# a styled HTML table (first column bold, even-numbered columns white-on-black).
ALOSDfInterest %>%
  summarise(
    Min = min(AveragelengthofstayInDays, na.rm = TRUE),
    Max = max(AveragelengthofstayInDays, na.rm = TRUE),
    n = n(),
    Missing = sum(is.na(AveragelengthofstayInDays)),
    Q1 = quantile(AveragelengthofstayInDays, probs = .25, na.rm = TRUE),
    Median = median(AveragelengthofstayInDays, na.rm = TRUE),
    Q3 = quantile(AveragelengthofstayInDays, probs = .75, na.rm = TRUE),
    Mean = mean(AveragelengthofstayInDays, na.rm = TRUE),
    SD = sd(AveragelengthofstayInDays, na.rm = TRUE),
    IQR = IQR(AveragelengthofstayInDays, na.rm = TRUE)
  ) %>%
  knitr::kable(
    "html",
    caption = "Table 1: Descriptive Statistics",
    align = "llllllllll",
    col.names = c(
      "Minimum", "Maximum", "Sample Size", "Missing Count", "First Quartile",
      "Median", "Third Quartile", "Mean", "Standard Deviation", "IQR"
    ),
    digits = 2
  ) %>%
  kable_styling(latex_options = "HOLD_position") %>%
  column_spec(1, bold = TRUE) %>%
  column_spec(c(2, 4, 6, 8, 10), color = 'white', background = 'black')
Table 1: Descriptive Statistics
Minimum Maximum Sample Size Missing Count First Quartile Median Third Quartile Mean Standard Deviation IQR
1.3 11.9 371 0 2.9 3.8 5.5 4.64 2.43 2.6
# Boxplot of average length of stay for the selected hospital network;
# outliers are drawn as small open circles.
ggplot(
  ALOSDfInterest,
  aes(x = LocalHospitalNetwork, y = AveragelengthofstayInDays)
) +
  geom_boxplot(
    fill = '#4271AE',
    color = "#1F3552",
    outlier.colour = "black",
    outlier.shape = 1,
    outlier.size = 1.5
  ) +
  theme_economist() +
  theme(
    plot.title = element_text(family = "Tahoma", hjust = 0.5),
    text = element_text(family = "Tahoma"),
    axis.title = element_text(size = 12)
  ) +
  scale_x_discrete(name = "\n") +
  scale_y_continuous(name = 'Average Length of Stay (In Days)\n') +
  ggtitle("Boxplot for ALOS for South Western Sydney hospital\n")

# Frequency histogram of average length of stay with the sample mean (red)
# and median (orange) marked. Both statistics are computed once and reused for
# the reference lines and their labels, so the annotations cannot drift from
# the data.
los_mean <- mean(ALOSDfInterest$AveragelengthofstayInDays)
los_median <- median(ALOSDfInterest$AveragelengthofstayInDays)

ggplot(ALOSDfInterest, aes(AveragelengthofstayInDays)) +
  geom_histogram(
    fill = "#4271AE", color = "#1F3552",
    binwidth = 0.3, position = "identity"
  ) +
  # Constant intercepts are passed outside aes(): mapping `df$col` inside
  # aes() together with `data =` draws one identical line per data row.
  geom_vline(
    xintercept = los_mean,
    colour = "red", linetype = "dashed", size = 0.8
  ) +
  geom_vline(
    xintercept = los_median,
    colour = "orange", linetype = "dashed", size = 0.6
  ) +
  ggtitle("Frequency histogram of South Western Sydney Hospital\n") +
  theme_economist() +
  theme(
    plot.title = element_text(family = "Tahoma", hjust = 0.5),
    text = element_text(family = "Tahoma"),
    axis.title = element_text(size = 12)
  ) +
  scale_x_continuous(name = "\nAverage Length of Stay (In Days)") +
  # annotate() places a single label without needing a one-row data subset.
  # The x/y positions are hand-tuned for this plot.
  annotate(
    "text", x = 5.4, y = 33.5,
    label = sprintf('μ = %.2f', los_mean), size = 3
  ) +
  annotate(
    "text", x = 3, y = 36.5,
    label = paste0('Median = ', round(los_median, 2)), size = 3
  ) +
  scale_y_continuous(name = 'Frequency\n')

# Normal QQ plot of average length of stay, with a centred title.
ggqqplot(
  ALOSDfInterest$AveragelengthofstayInDays,
  size = 0.5
) +
  ggtitle('QQ Plot for South Western Sydney Hospital ALOS') +
  theme(plot.title = element_text(hjust = 0.5))


6. Hypothesis Testing (Problem Statement 1)


# Two-sided one-sample t-test of the mean length of stay against 4.5 days,
# reported with a 95% confidence interval.
t.test(
  ALOSDfInterest$AveragelengthofstayInDays,
  alternative = "two.sided",
  mu = 4.5,
  conf.level = 0.95
)
## 
##  One Sample t-test
## 
## data:  ALOSDfInterest$AveragelengthofstayInDays
## t = 1.1346, df = 370, p-value = 0.2573
## alternative hypothesis: true mean is not equal to 4.5
## 95 percent confidence interval:
##  4.394867 4.891926
## sample estimates:
## mean of x 
##  4.643396

Part B



7. Data (Problem Statement 2)


# Load the tutorial scores and strip spaces from the column names so they can
# be used as bare identifiers.
TutorialDf <- fread("Assignment 4b-3.csv")
setnames(TutorialDf, gsub(" ", "", names(TutorialDf), fixed = TRUE))

# Keep only the before/after score columns for the paired analysis.
TutorialDfInterest <- TutorialDf[, .(Scorebeforetutorial, Scoreaftertutorial)]
head(TutorialDfInterest)
##    Scorebeforetutorial Scoreaftertutorial
## 1:                  50                 42
## 2:                  13                 38
## 3:                  27                 43
## 4:                  44                 37
## 5:                  35                 35
## 6:                  55                 41

8. Descriptive Statistics and Visualisation (Problem Statement 2)


# Per-student change in score: positive values mean the score was higher
# after the tutorial than before it.
TutorialDfInterest <- TutorialDfInterest %>%
  mutate(difference = Scoreaftertutorial - Scorebeforetutorial)

# Long format: one row per (score type, value) so the three series can be
# summarised and plotted side by side.
GatherDf <- TutorialDfInterest %>%
  gather(Scorebeforetutorial, Scoreaftertutorial, difference,
         key = 'Parameter', value = 'value')

# Descriptive statistics per score type, rendered as a styled HTML table.
GatherDf %>%
  # Group by the bare column name; `GatherDf$Parameter` would bypass the data
  # mask and create a grouping column literally named "GatherDf$Parameter".
  group_by(Parameter) %>%
  summarise(
    Min = min(value, na.rm = TRUE),
    Max = max(value, na.rm = TRUE),
    n = n(),
    Missing = sum(is.na(value)),
    Q1 = quantile(value, probs = .25, na.rm = TRUE),
    Median = median(value, na.rm = TRUE),
    Q3 = quantile(value, probs = .75, na.rm = TRUE),
    Mean = mean(value, na.rm = TRUE),
    SD = sd(value, na.rm = TRUE),
    IQR = IQR(value, na.rm = TRUE)
  ) %>%
  knitr::kable(
    "html",
    caption = "Table 1: Descriptive Statistics",
    # Eleven columns (score type + ten statistics) need eleven alignment letters.
    align = "lllllllllll",
    col.names = c(
      "Score Type", "Minimum", "Maximum", "Sample Size", "Missing Count",
      "First Quartile", "Median", "Third Quartile", "Mean",
      "Standard Deviation", "IQR"
    ),
    digits = 2
  ) %>%
  kable_styling(latex_options = "HOLD_position") %>%
  column_spec(1, bold = TRUE) %>%
  column_spec(c(2, 4, 6, 8, 10), color = 'white', background = 'black')
Table 1: Descriptive Statistics
Score Type Minimum Maximum Sample Size Missing Count First Quartile Median Third Quartile Mean Standard Deviation IQR
difference -14 31 1290 0 0 0 14 5.35 10.04 14
Scoreaftertutorial 33 55 1290 0 37 41 44 41.17 4.95 7
Scorebeforetutorial 13 55 1290 0 27 37 44 35.82 10.54 17
# Side-by-side boxplots of the before score, after score and their difference;
# outliers are drawn as small open circles.
ggplot(
  GatherDf,
  aes(x = Parameter, y = value)
) +
  geom_boxplot(
    fill = '#4271AE',
    color = "#1F3552",
    outlier.colour = "black",
    outlier.shape = 1,
    outlier.size = 1.5
  ) +
  theme_economist() +
  theme(
    plot.title = element_text(family = "Tahoma", hjust = 0.5),
    text = element_text(family = "Tahoma"),
    axis.title = element_text(size = 12)
  ) +
  scale_x_discrete(name = "\nScore type") +
  scale_y_continuous(name = 'Score of students\n') +
  ggtitle("Boxplot for Before and After tutorial score\n")

# Draw a density-scaled histogram of `scores` overlaid with:
#   - dashed vertical lines at the mean (red) and median (orange),
#   - text labels for both statistics,
#   - a kernel density estimate (blue) and a fitted normal curve (green).
#
# scores        numeric vector to plot
# title         main title of the plot
# density_label legend entry for the kernel density curve
# mean_pos      c(x, y) position of the mean label (hand-tuned per plot)
# median_pos    c(x, y) position of the median label (hand-tuned per plot)
# xlab          x-axis label
plot_score_histogram <- function(scores, title, density_label,
                                 mean_pos, median_pos,
                                 xlab = 'Test Score') {
  score_mean <- mean(scores)
  score_median <- median(scores)
  score_sd <- sd(scores)

  # probability = TRUE scales bar areas to sum to 1, so the y axis is a
  # density, not a frequency.
  hist(scores, breaks = 20, probability = TRUE,
       xlab = xlab, ylab = 'Density', main = title)
  abline(v = score_mean, col = "red", lwd = 2, lty = 2)
  abline(v = score_median, col = "orange", lwd = 2, lty = 2)

  # Labels are derived from the data so they always match the drawn lines.
  text(x = mean_pos[1], y = mean_pos[2],
       labels = sprintf('μ = %.2f', score_mean), cex = 0.72)
  text(x = median_pos[1], y = median_pos[2],
       labels = paste0('Median = ', round(score_median, 2)), cex = 0.73)

  lines(density(scores), col = 'Blue', lwd = 2)
  curve(dnorm(x, mean = score_mean, sd = score_sd),
        yaxt = "n", lty = "dotted", col = "darkgreen", lwd = 4, add = TRUE)
  legend("topright",
         legend = c(density_label, "Normal Curve", 'Mean', 'Median'),
         bty = "n", text.col = "black", horiz = FALSE,
         pch = c(15, 15, 15, 15),
         col = c('Blue', "darkgreen", 'red', 'orange'), cex = 0.60)
  invisible(NULL)
}

plot_score_histogram(
  TutorialDfInterest$Scorebeforetutorial,
  title = "Histogram for Score Before Tutorial",
  density_label = "Density Curve for Score Before Tutorial",
  mean_pos = c(31, 0.0505),
  median_pos = c(40, 0.055)
)

plot_score_histogram(
  TutorialDfInterest$Scoreaftertutorial,
  title = "Histogram for Score After Tutorial",
  density_label = "Density Curve for Score After Tutorial",
  mean_pos = c(42.5, 0.12),
  median_pos = c(39, 0.09)
)

# The legend and x-axis previously reused the "Score After Tutorial" /
# "Test Score" wording although this plot shows the score difference.
plot_score_histogram(
  TutorialDfInterest$difference,
  title = "Histogram for Score Difference",
  density_label = "Density Curve for Score Difference",
  mean_pos = c(8, 0.12),
  median_pos = c(-5, 0.15),
  xlab = 'Score Difference'
)

# Normal QQ plots for the before score, the after score and their difference,
# each with a centred title.
ggqqplot(TutorialDfInterest$Scorebeforetutorial, size = 0.5) +
  ggtitle('QQ Plot for Score Before Tutorial') +
  theme(plot.title = element_text(hjust = 0.5))

ggqqplot(TutorialDfInterest$Scoreaftertutorial, size = 0.5) +
  ggtitle('QQ Plot for Score After Tutorial') +
  theme(plot.title = element_text(hjust = 0.5))

# Title corrected: it previously read "Score Differece b/w score".
ggqqplot(TutorialDfInterest$difference, size = 0.5) +
  ggtitle('QQ Plot for Score Difference (After - Before)') +
  theme(plot.title = element_text(hjust = 0.5))


9. Hypothesis Testing (Problem Statement 2)


# One-sided paired t-test: the alternative is that the mean of
# (after - before) is greater than zero.
t.test(
  TutorialDfInterest$Scoreaftertutorial,
  TutorialDfInterest$Scorebeforetutorial,
  alternative = "greater",
  paired = TRUE
)
## 
##  Paired t-test
## 
## data:  TutorialDfInterest$Scoreaftertutorial and TutorialDfInterest$Scorebeforetutorial
## t = 19.144, df = 1289, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
##  4.890355      Inf
## sample estimates:
## mean of the differences 
##                5.350388
# Profile plot: one line per student joining the before score (x = 1) to the
# after score (x = 2). The default x axis is suppressed and replaced with
# labelled ticks.
matplot(
  rbind(
    TutorialDfInterest$Scorebeforetutorial,
    TutorialDfInterest$Scoreaftertutorial
  ),
  type = "b",
  pch = 19,
  col = 1,
  lty = 1,
  xaxt = "n",
  xlab = "Tutorial Type",
  ylab = "Student Score Number"
)
axis(1, at = 1:2, labels = c("Before", "After"))

# Dependent-sample assessment plot. granova.ds() takes the FIRST column as X
# (horizontal axis) and the second as Y, and reports D = X - Y. Here X is the
# after score and Y the before score, so the axis labels must follow that
# order (they were previously swapped).
granova.ds(
  data.frame(TutorialDfInterest$Scoreaftertutorial,
             TutorialDfInterest$Scorebeforetutorial),
  xlab = "Student's Score - After",
  ylab = "Student's Score - Before"
)

##             Summary Stats
## n                1290.000
## mean(x)            41.171
## mean(y)            35.821
## mean(D=x-y)         5.350
## SD(D)              10.038
## ES(D)               0.533
## r(x,y)              0.334
## r(x+y,d)           -0.660
## LL 95%CI            4.802
## UL 95%CI            5.899
## t(D-bar)           19.144
## df.t             1289.000
## pval.t              0.000

10. Discussion



11. References


[1] “Admitted patients”, Australian Institute of Health and Welfare 2020. [Online]. Available: https://www.aihw.gov.au/reports-data/myhospitals/sectors/admitted-patients. [Accessed: 10-May-2020].