library('data.table')
library("ggpubr")
library('dplyr')
library(tidyr)
library(kableExtra)
library(ggthemes)
library(gridExtra)
library(car)
library(Hmisc)
library(psychometric)
library(granova)
options(scipen=999)
Rashbir Singh Kohli (s3810585)
# Report notes carried over from the write-up (sentence fragments kept as found):
#   - Assignment 4b-3.csv, data that have information about the impact of
#     tutorials on the performance of the students.
#   - Assignment 4A.csv is used, and read using fread() function from
#     data.table library keeping header = TRUE.
#   - gsub() function to remove white spaces, brackets, and text between them.
#   - NP and - symbols.
#   - AveragelengthofstayInDays column was converted to numeric using
#     as.numeric() function.

# Read the length-of-stay data; header = TRUE takes the column names from the
# first row of the file.
ALOSDf <- fread('Assignment 4A.csv', header = TRUE)

# Normalise the column names step by step.
names(ALOSDf) <- gsub(" ", "", names(ALOSDf)) # Removing white spaces in column name
names(ALOSDf) <- gsub("LHN", "", names(ALOSDf)) # Removing the letters inside the bracket
names(ALOSDf) <- gsub("days", "InDays", names(ALOSDf)) # Substituting 'days' with 'InDays'
# Remove everything except letters. The class is spelled "A-Za-z" because the
# range "A-z" also covers the punctuation that sits between 'Z' and 'a' in
# ASCII ([ \ ] ^ _ `), which would otherwise survive in the names.
names(ALOSDf) <- gsub("[^A-Za-z]", "", names(ALOSDf))
# Keep only the South Western Sydney rows and the two columns analysed below.
ALOSDfInterest <- ALOSDf[
  LocalHospitalNetwork == 'South Western Sydney',
  c("LocalHospitalNetwork", "AveragelengthofstayInDays")
]

# Drop the non-numeric placeholders ('-' and 'NP') in a single pass.
ALOSDfInterest <- ALOSDfInterest[
  AveragelengthofstayInDays != '-' & AveragelengthofstayInDays != 'NP',
]

# Convert average length of stay (in days) to numeric.
ALOSDfInterest$AveragelengthofstayInDays <-
  as.numeric(ALOSDfInterest$AveragelengthofstayInDays)

head(ALOSDfInterest)
## LocalHospitalNetwork AveragelengthofstayInDays
## 1: South Western Sydney 2.6
## 2: South Western Sydney 2.6
## 3: South Western Sydney 2.5
## 4: South Western Sydney 2.5
## 5: South Western Sydney 2.5
## 6: South Western Sydney 2.4
# Table of descriptive statistics for average length of stay, rendered as a
# styled HTML table (first column bold, every second column white-on-black).
ALOSDfInterest %>%
  summarise(
    Min = min(AveragelengthofstayInDays, na.rm = TRUE),
    Max = max(AveragelengthofstayInDays, na.rm = TRUE),
    n = n(),
    Missing = sum(is.na(AveragelengthofstayInDays)),
    Q1 = quantile(AveragelengthofstayInDays, probs = .25, na.rm = TRUE),
    Median = median(AveragelengthofstayInDays, na.rm = TRUE),
    Q3 = quantile(AveragelengthofstayInDays, probs = .75, na.rm = TRUE),
    Mean = mean(AveragelengthofstayInDays, na.rm = TRUE),
    SD = sd(AveragelengthofstayInDays, na.rm = TRUE),
    IQR = IQR(AveragelengthofstayInDays, na.rm = TRUE)
  ) %>%
  knitr::kable(
    format = "html",
    caption = "Table 1: Descriptive Statistics",
    align = "llllllllll",
    col.names = c(
      "Minimum", "Maximum", "Sample Size", "Missing Count", "First Quartile",
      "Median", "Third Quartile", "Mean", "Standard Deviation", "IQR"
    ),
    digits = 2
  ) %>%
  kable_styling(latex_options = "HOLD_position") %>%
  column_spec(1, bold = TRUE) %>%
  column_spec(c(2, 4, 6, 8, 10), color = 'white', background = 'black')
| Minimum | Maximum | Sample Size | Missing Count | First Quartile | Median | Third Quartile | Mean | Standard Deviation | IQR |
|---|---|---|---|---|---|---|---|---|---|
| 1.3 | 11.9 | 371 | 0 | 2.9 | 3.8 | 5.5 | 4.64 | 2.43 | 2.6 |
# Boxplot of average length of stay for the selected hospital network.
ggplot(
  ALOSDfInterest,
  aes(x = LocalHospitalNetwork, y = AveragelengthofstayInDays)
) +
  geom_boxplot(
    fill = '#4271AE',
    color = "#1F3552",
    outlier.colour = "black",
    outlier.shape = 1,
    outlier.size = 1.5
  ) +
  ggtitle("Boxplot for ALOS for South Western Sydney hospital\n") +
  scale_x_discrete(name = "\n") +
  scale_y_continuous(name = 'Average Length of Stay (In Days)\n') +
  theme_economist() +
  theme(
    plot.title = element_text(family = "Tahoma", hjust = 0.5),
    text = element_text(family = "Tahoma"),
    axis.title = element_text(size = 12)
  )
# Report note: the mean (μ) is 4.64 and the median is 3.80.

# Frequency histogram of average length of stay with dashed reference lines at
# the mean (red) and the median (orange).
alos_days <- ALOSDfInterest$AveragelengthofstayInDays
alos_mean <- mean(alos_days)
alos_median <- median(alos_days)

ggplot(ALOSDfInterest, aes(AveragelengthofstayInDays)) +
  geom_histogram(
    fill = "#4271AE", color = "#1F3552", binwidth = 0.3, position = "identity"
  ) +
  # The statistics are constants, so they are passed as fixed parameters
  # instead of being mapped through aes() with `df$col` against the full data.
  geom_vline(
    xintercept = alos_mean, colour = "red", linetype = "dashed", size = 0.8
  ) +
  geom_vline(
    xintercept = alos_median, colour = "orange", linetype = "dashed", size = 0.6
  ) +
  # annotate() places a single label without needing a one-row data subset;
  # the label text is computed so it cannot drift from the plotted lines.
  annotate(
    "text", x = 5.4, y = 33.5, size = 3,
    label = paste0("μ = ", round(alos_mean, 2))
  ) +
  annotate(
    "text", x = 3, y = 36.5, size = 3,
    label = paste0("Median = ", round(alos_median, 2))
  ) +
  ggtitle("Frequency histogram of South Western Sydney Hospital\n") +
  theme_economist() +
  theme(
    plot.title = element_text(family = "Tahoma", hjust = 0.5),
    text = element_text(family = "Tahoma"),
    axis.title = element_text(size = 12)
  ) +
  scale_x_continuous(name = "\nAverage Length of Stay (In Days)") +
  scale_y_continuous(name = 'Frequency\n')
# Normal QQ plot of average length of stay, with a centred title.
ggqqplot(
  ALOSDfInterest$AveragelengthofstayInDays,
  size = 0.5
) +
  ggtitle('QQ Plot for South Western Sydney Hospital ALOS') +
  theme(plot.title = element_text(hjust = 0.5))

# Two-sided one-sample t-test of the mean against 4.5 at the 95% level.
# The data argument is spelled out in full because t.test() echoes that
# expression in the "data:" line of its printed result.
t.test(
  ALOSDfInterest$AveragelengthofstayInDays,
  alternative = "two.sided",
  mu = 4.5,
  conf.level = 0.95
)
##
## One Sample t-test
##
## data: ALOSDfInterest$AveragelengthofstayInDays
## t = 1.1346, df = 370, p-value = 0.2573
## alternative hypothesis: true mean is not equal to 4.5
## 95 percent confidence interval:
## 4.394867 4.891926
## sample estimates:
## mean of x
## 4.643396
# Report notes carried over from the write-up (sentence fragments kept as found):
#   - Assignment 4b-3.csv is used, and read using fread() function from
#     data.table library.
#   - gsub() function to remove white spaces.

# Read the tutorial score data.
TutorialDf <- fread("Assignment 4b-3.csv")

# Removing white spaces in column names so they can be referenced directly.
names(TutorialDf) <- gsub(" ", "", names(TutorialDf))

# Keep only the before/after score columns used in the analysis below.
TutorialDfInterest <- TutorialDf[, c('Scorebeforetutorial', 'Scoreaftertutorial')]
head(TutorialDfInterest)
## Scorebeforetutorial Scoreaftertutorial
## 1: 50 42
## 2: 13 38
## 3: 27 43
## 4: 44 37
## 5: 35 35
## 6: 55 41
# Per-student improvement: after-tutorial score minus before-tutorial score.
TutorialDfInterest <- TutorialDfInterest %>%
  mutate(difference = Scoreaftertutorial - Scorebeforetutorial)

# Long format: one row per (score type, value) pair, used for the grouped
# summary here and the boxplot further down.
# gather() is superseded by tidyr::pivot_longer(); it is kept so that the row
# order of GatherDf stays as it was.
GatherDf <- TutorialDfInterest %>%
  gather(
    Scorebeforetutorial, Scoreaftertutorial, difference,
    key = 'Parameter', value = 'value'
  )

# Descriptive statistics per score type, rendered as a styled HTML table.
GatherDf %>%
  # Group by the bare column name: `GatherDf$Parameter` inside group_by()
  # bypasses data masking and creates a column literally named
  # "GatherDf$Parameter".
  group_by(Parameter) %>%
  summarise(
    Min = min(value, na.rm = TRUE),
    Max = max(value, na.rm = TRUE),
    n = n(),
    Missing = sum(is.na(value)),
    Q1 = quantile(value, probs = .25, na.rm = TRUE),
    Median = median(value, na.rm = TRUE),
    Q3 = quantile(value, probs = .75, na.rm = TRUE),
    Mean = mean(value, na.rm = TRUE),
    SD = sd(value, na.rm = TRUE),
    IQR = IQR(value, na.rm = TRUE)
  ) %>%
  knitr::kable(
    format = "html",
    caption = "Table 1: Descriptive Statistics",
    # One alignment entry per column: the table has 11 columns (score type
    # plus ten statistics), not 10.
    align = rep("l", 11),
    col.names = c(
      "Score Type", "Minimum", "Maximum", "Sample Size", "Missing Count",
      "First Quartile", "Median", "Third Quartile", "Mean",
      "Standard Deviation", "IQR"
    ),
    digits = 2
  ) %>%
  kable_styling(latex_options = "HOLD_position") %>%
  column_spec(1, bold = TRUE) %>%
  column_spec(c(2, 4, 6, 8, 10), color = 'white', background = 'black')
| Score Type | Minimum | Maximum | Sample Size | Missing Count | First Quartile | Median | Third Quartile | Mean | Standard Deviation | IQR |
|---|---|---|---|---|---|---|---|---|---|---|
| difference | -14 | 31 | 1290 | 0 | 0 | 0 | 14 | 5.35 | 10.04 | 14 |
| Scoreaftertutorial | 33 | 55 | 1290 | 0 | 37 | 41 | 44 | 41.17 | 4.95 | 7 |
| Scorebeforetutorial | 13 | 55 | 1290 | 0 | 27 | 37 | 44 | 35.82 | 10.54 | 17 |
# Side-by-side boxplots of the three score types held in GatherDf.
ggplot(GatherDf, aes(x = Parameter, y = value)) +
  geom_boxplot(
    fill = '#4271AE',
    color = "#1F3552",
    outlier.colour = "black",
    outlier.shape = 1,
    outlier.size = 1.5
  ) +
  ggtitle("Boxplot for Before and After tutorial score\n") +
  scale_x_discrete(name = "\nScore type") +
  scale_y_continuous(name = 'Score of students\n') +
  theme_economist() +
  theme(
    plot.title = element_text(family = "Tahoma", hjust = 0.5),
    text = element_text(family = "Tahoma"),
    axis.title = element_text(size = 12)
  )
# Draw a density-scaled histogram of `scores` with dashed lines at the mean
# (red) and median (orange), a kernel density curve (blue), a normal curve
# using the sample mean and sd (dark green, dotted) and a legend.
#
# scores          numeric vector to plot.
# title           main title of the histogram.
# density_label   legend entry for the kernel density curve.
# mean_label_at   c(x, y) position of the "μ = ..." text label.
# median_label_at c(x, y) position of the "Median = ..." text label.
#
# Called for its plotting side effect; returns NULL invisibly.
plot_score_histogram <- function(scores, title, density_label,
                                 mean_label_at, median_label_at) {
  score_mean <- mean(scores)
  score_median <- median(scores)
  score_sd <- sd(scores)

  hist(
    scores, breaks = 20, probability = TRUE,
    xlab = 'Test Score', ylab = 'Frequency', main = title
  )
  abline(v = score_mean, col = "red", lwd = 2, lty = 2)
  abline(v = score_median, col = "orange", lwd = 2, lty = 2)

  # Labels are computed from the data so they always match the drawn lines.
  text(
    x = mean_label_at[1], y = mean_label_at[2],
    labels = paste0("μ = ", round(score_mean, 2)), cex = 0.72
  )
  text(
    x = median_label_at[1], y = median_label_at[2],
    labels = paste0("Median = ", round(score_median, 2)), cex = 0.73
  )

  lines(density(scores), col = 'Blue', lwd = 2)
  curve(
    dnorm(x, mean = score_mean, sd = score_sd),
    yaxt = "n", lty = "dotted", col = "darkgreen", lwd = 4, add = TRUE
  )
  legend(
    "topright",
    legend = c(density_label, "Normal Curve", 'Mean', 'Median'),
    bty = "n", text.col = "black", horiz = FALSE,
    pch = c(15, 15, 15, 15),
    col = c('Blue', "darkgreen", 'red', 'orange'),
    cex = 0.60
  )
  invisible(NULL)
}

plot_score_histogram(
  TutorialDfInterest$Scorebeforetutorial,
  title = "Histogram for Score Before Tutorial",
  density_label = "Density Curve for Score Before Tutorial",
  mean_label_at = c(31, 0.0505),
  median_label_at = c(40, 0.055)
)

plot_score_histogram(
  TutorialDfInterest$Scoreaftertutorial,
  title = "Histogram for Score After Tutorial",
  density_label = "Density Curve for Score After Tutorial",
  mean_label_at = c(42.5, 0.12),
  median_label_at = c(39, 0.09)
)

# The legend previously said "Score After Tutorial" here, copied from the
# plot above; this histogram shows the score difference.
plot_score_histogram(
  TutorialDfInterest$difference,
  title = "Histogram for Score Difference",
  density_label = "Density Curve for Score Difference",
  mean_label_at = c(8, 0.12),
  median_label_at = c(-5, 0.15)
)
# Normal QQ plots for the before score, the after score and their difference,
# each with a centred title.
ggqqplot(TutorialDfInterest$Scorebeforetutorial, size = 0.5) +
  ggtitle('QQ Plot for Score Before Tutorial') +
  theme(plot.title = element_text(hjust = 0.5))

ggqqplot(TutorialDfInterest$Scoreaftertutorial, size = 0.5) +
  ggtitle('QQ Plot for Score After Tutorial') +
  theme(plot.title = element_text(hjust = 0.5))

# Title spelling corrected ("Differece" -> "Difference").
ggqqplot(TutorialDfInterest$difference, size = 0.5) +
  ggtitle('QQ Plot for Score Difference b/w score') +
  theme(plot.title = element_text(hjust = 0.5))
# Paired, one-sided t-test: is the mean of (after - before) greater than 0?
# Both data arguments are written out in full because t.test() echoes those
# expressions in the "data:" line of its printed result.
t.test(
  TutorialDfInterest$Scoreaftertutorial,
  TutorialDfInterest$Scorebeforetutorial,
  alternative = "greater",
  paired = TRUE
)
##
## Paired t-test
##
## data: TutorialDfInterest$Scoreaftertutorial and TutorialDfInterest$Scorebeforetutorial
## t = 19.144, df = 1289, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
## 4.890355 Inf
## sample estimates:
## mean of the differences
## 5.350388
# One line per student joining the before score (x = 1) to the after score
# (x = 2). The default x axis is suppressed and replaced by named ticks.
matplot(
  rbind(
    TutorialDfInterest$Scorebeforetutorial,
    TutorialDfInterest$Scoreaftertutorial
  ),
  type = "b",
  pch = 19,
  col = 1,
  lty = 1,
  xaxt = "n",
  xlab = "Tutorial Type",
  ylab = "Student Score Number"
)
axis(1, at = 1:2, labels = c("Before", "After"))
# Dependent-sample assessment plot for the paired scores.
# The after scores are passed as the first column and the before scores as the
# second, so the printed difference D = x - y is (after - before), matching the
# paired t-test above.
# NOTE(review): per the granova.ds documentation the first column is X
# (horizontal axis) and the second is Y, so the axis labels are set to follow
# the column order; they were previously the other way round. Confirm against
# the rendered plot.
granova.ds(
  data.frame(
    TutorialDfInterest$Scoreaftertutorial,
    TutorialDfInterest$Scorebeforetutorial
  ),
  xlab = "Student's Score - After",
  ylab = "Student's Score - Before"
)
## Summary Stats
## n 1290.000
## mean(x) 41.171
## mean(y) 35.821
## mean(D=x-y) 5.350
## SD(D) 10.038
## ES(D) 0.533
## r(x,y) 0.334
## r(x+y,d) -0.660
## LL 95%CI 4.802
## UL 95%CI 5.899
## t(D-bar) 19.144
## df.t 1289.000
## pval.t 0.000
[1] “Admitted patients”, Australian Institute of Health and Welfare 2020. [Online]. Available: https://www.aihw.gov.au/reports-data/myhospitals/sectors/admitted-patients. [Accessed: 10-May-2020].