Some of the analysis being done may have errors to be updated over time. Findings in this document should not be used for generalisation.
data<-read.csv("/Users/GreatWallUser/Documents/Rstudio Class/csec analysis 2017.csv")
head(data,20) # first 20 rows of the data set
## names Grade.1 Grade.2 Grade.3 Grade.4 Grade.5 total.subjects
## 1 albert tiffany 1 3 3 NA 2 9
## 2 alexander jemuel 2 3 3 NA NA 9
## 3 alli nikita 1 1 6 1 1 10
## 4 archer devon 1 2 2 NA 1 7
## 5 archer joanna 2 3 2 NA NA 8
## 6 aziz tauriq 5 5 NA 1 NA 11
## 7 bacchus saif 2 4 3 NA NA 9
## 8 baksh mohamed saieed 7 2 3 1 NA 13
## 9 barrington malik 1 2 4 1 NA 8
## 10 barry casselle 1 3 4 NA NA 8
## 11 basir aftab 1 1 4 2 NA 8
## 12 bijulisingh rebekak 3 7 NA 1 NA 11
## 13 bisnauth bonita NA NA 2 3 2 7
## 14 bissessar vimaldat 5 3 3 NA NA 11
## 15 boston shania 4 2 2 NA NA 8
## 16 browne joyann 2 4 5 NA NA 11
## 17 calistro drexel 1 4 4 1 NA 10
## 18 chando mohindar 4 5 NA NA NA 9
## 19 charles adero NA NA 3 2 NA 5
## 20 chee-a-tow brandon NA 2 4 3 NA 9
## X balance exam.year
## 1 NA 9 2014
## 2 2 8 2015
## 3 NA 10 2017
## 4 2 6 2017
## 5 2 7 2015
## 6 NA 11 2017
## 7 NA 9 2014
## 8 NA 13 2015
## 9 NA 8 2017
## 10 NA 8 2016
## 11 NA 8 2013
## 12 NA 11 2017
## 13 NA 7 2013
## 14 NA 11 2016
## 15 NA 8 2015
## 16 NA 11 2014
## 17 NA 10 2015
## 18 NA 9 2016
## 19 NA 5 2013
## 20 NA 9 2017
str(data)
## 'data.frame': 981 obs. of 10 variables:
## $ names : Factor w/ 981 levels "Abraham Anthony",..: 8 9 18 26 27 39 40 52 56 57 ...
## $ Grade.1 : int 1 2 1 1 2 5 2 7 1 1 ...
## $ Grade.2 : int 3 3 1 2 3 5 4 2 2 3 ...
## $ Grade.3 : int 3 3 6 2 2 NA 3 3 4 4 ...
## $ Grade.4 : int NA NA 1 NA NA 1 NA 1 1 NA ...
## $ Grade.5 : int 2 NA 1 1 NA NA NA NA NA NA ...
## $ total.subjects: int 9 9 10 7 8 11 9 13 8 8 ...
## $ X : int NA 2 NA 2 2 NA NA NA NA NA ...
## $ balance : Factor w/ 18 levels "-1","#VALUE!",..: 18 17 5 15 16 6 18 8 17 17 ...
## $ exam.year : int 2014 2015 2017 2017 2015 2017 2014 2015 2017 2016 ...
summary(data)
## names Grade.1 Grade.2 Grade.3
## Abraham Anthony : 1 Min. : 1.000 Min. :1.000 Min. :1.000
## Abraham Christian : 1 1st Qu.: 2.000 1st Qu.:2.000 1st Qu.:2.000
## Abraham Christopher: 1 Median : 3.000 Median :3.000 Median :3.000
## Abraham Isaac : 1 Mean : 2.948 Mean :2.979 Mean :2.582
## Abraham Michelle : 1 3rd Qu.: 4.000 3rd Qu.:4.000 3rd Qu.:3.000
## Abraham Peter : 1 Max. :11.000 Max. :8.000 Max. :6.000
## (Other) :975 NA's :215 NA's :177 NA's :277
## Grade.4 Grade.5 total.subjects X balance
## Min. :1.000 Min. :1.000 Min. : 0.000 Min. :1.000 10 :133
## 1st Qu.:1.000 1st Qu.:1.000 1st Qu.: 7.000 1st Qu.:2.000 9 :124
## Median :2.000 Median :1.000 Median : 9.000 Median :2.000 7 :122
## Mean :2.434 Mean :1.621 Mean : 8.638 Mean :2.382 8 :119
## 3rd Qu.:3.000 3rd Qu.:2.000 3rd Qu.:10.000 3rd Qu.:3.000 5 :102
## Max. :6.000 Max. :3.000 Max. :16.000 Max. :3.000 6 : 96
## NA's :347 NA's :704 NA's :858 (Other):285
## exam.year
## Min. :2013
## 1st Qu.:2014
## Median :2015
## Mean :2015
## 3rd Qu.:2016
## Max. :2017
##
data$Grade.1 = ifelse(is.na(data$Grade.1),
ave(data$Grade.1, FUN = function(x) mean(x, na.rm = TRUE)),
data$Grade.1)
data$Grade.2 = ifelse(is.na(data$Grade.2),
ave(data$Grade.2, FUN = function(x) mean(x, na.rm = TRUE)),
data$Grade.2)
data$Grade.3 = ifelse(is.na(data$Grade.3),
ave(data$Grade.3, FUN = function(x) mean(x, na.rm = TRUE)),
data$Grade.3)
data$Grade.4 = ifelse(is.na(data$Grade.4),
ave(data$Grade.4, FUN = function(x) mean(x, na.rm = TRUE)),
data$Grade.4)
data$Grade.5 = ifelse(is.na(data$Grade.5),
ave(data$Grade.4, FUN = function(x) mean(x, na.rm = TRUE)),
data$Grade.5)
head(data$Grade.4)
## [1] 2.433754 2.433754 1.000000 2.433754 2.433754 1.000000
data$Grade.1 <- as.integer(data$Grade.1)
data$Grade.2 <- as.integer(data$Grade.2)
data$Grade.3 <- as.integer(data$Grade.3)
data$Grade.4 <- as.integer(data$Grade.4)
data$Grade.5 <- as.integer(data$Grade.5)
data$balance <- as.integer(data$balance)
head(data,10)
## names Grade.1 Grade.2 Grade.3 Grade.4 Grade.5 total.subjects
## 1 albert tiffany 1 3 3 2 2 9
## 2 alexander jemuel 2 3 3 2 2 9
## 3 alli nikita 1 1 6 1 1 10
## 4 archer devon 1 2 2 2 1 7
## 5 archer joanna 2 3 2 2 2 8
## 6 aziz tauriq 5 5 2 1 2 11
## 7 bacchus saif 2 4 3 2 2 9
## 8 baksh mohamed saieed 7 2 3 1 2 13
## 9 barrington malik 1 2 4 1 2 8
## 10 barry casselle 1 3 4 2 2 8
## X balance exam.year
## 1 NA 18 2014
## 2 2 17 2015
## 3 NA 5 2017
## 4 2 15 2017
## 5 2 16 2015
## 6 NA 6 2017
## 7 NA 18 2014
## 8 NA 8 2015
## 9 NA 17 2017
## 10 NA 17 2016
str(data)
## 'data.frame': 981 obs. of 10 variables:
## $ names : Factor w/ 981 levels "Abraham Anthony",..: 8 9 18 26 27 39 40 52 56 57 ...
## $ Grade.1 : int 1 2 1 1 2 5 2 7 1 1 ...
## $ Grade.2 : int 3 3 1 2 3 5 4 2 2 3 ...
## $ Grade.3 : int 3 3 6 2 2 2 3 3 4 4 ...
## $ Grade.4 : int 2 2 1 2 2 1 2 1 1 2 ...
## $ Grade.5 : int 2 2 1 1 2 2 2 2 2 2 ...
## $ total.subjects: int 9 9 10 7 8 11 9 13 8 8 ...
## $ X : int NA 2 NA 2 2 NA NA NA NA NA ...
## $ balance : int 18 17 5 15 16 6 18 8 17 17 ...
## $ exam.year : int 2014 2015 2017 2017 2015 2017 2014 2015 2017 2016 ...
data$UnGraded <- data$X
data$X <- NULL
#data %>% rename("Graded" = X)
head(data)
## names Grade.1 Grade.2 Grade.3 Grade.4 Grade.5 total.subjects
## 1 albert tiffany 1 3 3 2 2 9
## 2 alexander jemuel 2 3 3 2 2 9
## 3 alli nikita 1 1 6 1 1 10
## 4 archer devon 1 2 2 2 1 7
## 5 archer joanna 2 3 2 2 2 8
## 6 aziz tauriq 5 5 2 1 2 11
## balance exam.year UnGraded
## 1 18 2014 NA
## 2 17 2015 2
## 3 5 2017 NA
## 4 15 2017 2
## 5 16 2015 2
## 6 6 2017 NA
cordata = data[,c(2,3,4,5,6)]
corr <- round(cor(cordata), 1)
corr
## Grade.1 Grade.2 Grade.3 Grade.4 Grade.5
## Grade.1 1.0 -0.1 -0.1 0.0 0.1
## Grade.2 -0.1 1.0 -0.3 0.0 0.1
## Grade.3 -0.1 -0.3 1.0 -0.2 0.0
## Grade.4 0.0 0.0 -0.2 1.0 -0.3
## Grade.5 0.1 0.1 0.0 -0.3 1.0
ggcorrplot(corr,lab = TRUE, method = "circle",show.diag = TRUE, title="Correlogram showing the relationship among Scores")
data$UnGraded <- as.factor(data$UnGraded)
data$UnGraded <- as.numeric(data$UnGraded)
data$UnGraded <- ifelse(data$UnGraded==1, "0 Ungraded", ifelse(data$UnGraded==2, "1 Ungraded", "3 Ungraded"))
Totalgrade <- c(data$UnGraded)
barplot(table(Totalgrade),
main="BarPlot showing the total Number of Subjects that are ungraded",
names.arg=c("No Ungraded","1 Ungraded","2 Ungraded"),
xlab="Ungraded subjects",
ylab="% Number of students",
border="red",
col.axis="red",
col="blue",
density=10
)
data$UnGraded <- as.factor(data$UnGraded)
data$UnGraded <- as.numeric(data$UnGraded)
data$UnGraded <- ifelse(data$UnGraded==1, "0 Ungraded", ifelse(data$UnGraded==2, "1 Ungraded", "3 Ungraded"))
Totalgrade <- c(data$UnGraded)
table(Totalgrade)
## Totalgrade
## 0 Ungraded 1 Ungraded 3 Ungraded
## 3 70 50
str(data)
## 'data.frame': 981 obs. of 10 variables:
## $ names : Factor w/ 981 levels "Abraham Anthony",..: 8 9 18 26 27 39 40 52 56 57 ...
## $ Grade.1 : int 1 2 1 1 2 5 2 7 1 1 ...
## $ Grade.2 : int 3 3 1 2 3 5 4 2 2 3 ...
## $ Grade.3 : int 3 3 6 2 2 2 3 3 4 4 ...
## $ Grade.4 : int 2 2 1 2 2 1 2 1 1 2 ...
## $ Grade.5 : int 2 2 1 1 2 2 2 2 2 2 ...
## $ total.subjects: int 9 9 10 7 8 11 9 13 8 8 ...
## $ balance : int 18 17 5 15 16 6 18 8 17 17 ...
## $ exam.year : int 2014 2015 2017 2017 2015 2017 2014 2015 2017 2016 ...
## $ UnGraded : chr NA "1 Ungraded" NA "1 Ungraded" ...
#MOdel 1 - simple linear model
model_1list <- data %>%
select(names, Grade.1, Grade.2, Grade.3, Grade.4, Grade.5, total.subjects, UnGraded, balance,exam.year) %>%
filter(exam.year == "2016")
model_1 <- lm(total.subjects~ Grade.1, data = model_1list)
summary(model_1)
##
## Call:
## lm(formula = total.subjects ~ Grade.1, data = model_1list)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.1128 -1.1128 -0.1128 1.5808 5.8872
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.7257 0.3621 18.573 < 2e-16 ***
## Grade.1 0.6935 0.1151 6.026 7.59e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.422 on 207 degrees of freedom
## Multiple R-squared: 0.1492, Adjusted R-squared: 0.1451
## F-statistic: 36.31 on 1 and 207 DF, p-value: 7.59e-09
ggqqplot(residuals(model_1))
The R-squared value being 0.1492, only 15 % of the variance in the response variable is explained by the predictor variable (grade.1) in 2016
#MOdel 2 - simple linear model
model_2list <- data %>%
select(names, Grade.1, Grade.2, Grade.3, Grade.4, Grade.5, total.subjects, UnGraded, balance,exam.year) %>%
filter(exam.year == "2017")
model_2 <- lm(total.subjects~ Grade.1, data = model_2list)
summary(model_2)
##
## Call:
## lm(formula = total.subjects ~ Grade.1, data = model_2list)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.9183 -1.7032 0.0817 1.8667 5.2968
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.3484 0.3477 18.257 < 2e-16 ***
## Grade.1 0.7850 0.1097 7.155 1.4e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.446 on 208 degrees of freedom
## Multiple R-squared: 0.1975, Adjusted R-squared: 0.1937
## F-statistic: 51.2 on 1 and 208 DF, p-value: 1.404e-11
ggqqplot(residuals(model_2))
The R-squared value being 0.1975, only 19.8 % of the variance in the response variable is explained by the predictor variable (grade.1) n 2017
data$exam.year <- as.factor(data$exam.year)
str(data)
## 'data.frame': 981 obs. of 10 variables:
## $ names : Factor w/ 981 levels "Abraham Anthony",..: 8 9 18 26 27 39 40 52 56 57 ...
## $ Grade.1 : int 1 2 1 1 2 5 2 7 1 1 ...
## $ Grade.2 : int 3 3 1 2 3 5 4 2 2 3 ...
## $ Grade.3 : int 3 3 6 2 2 2 3 3 4 4 ...
## $ Grade.4 : int 2 2 1 2 2 1 2 1 1 2 ...
## $ Grade.5 : int 2 2 1 1 2 2 2 2 2 2 ...
## $ total.subjects: int 9 9 10 7 8 11 9 13 8 8 ...
## $ balance : int 18 17 5 15 16 6 18 8 17 17 ...
## $ exam.year : Factor w/ 5 levels "2013","2014",..: 2 3 5 5 3 5 2 3 5 4 ...
## $ UnGraded : chr NA "1 Ungraded" NA "1 Ungraded" ...
head(data)
## names Grade.1 Grade.2 Grade.3 Grade.4 Grade.5 total.subjects
## 1 albert tiffany 1 3 3 2 2 9
## 2 alexander jemuel 2 3 3 2 2 9
## 3 alli nikita 1 1 6 1 1 10
## 4 archer devon 1 2 2 2 1 7
## 5 archer joanna 2 3 2 2 2 8
## 6 aziz tauriq 5 5 2 1 2 11
## balance exam.year UnGraded
## 1 18 2014 <NA>
## 2 17 2015 1 Ungraded
## 3 5 2017 <NA>
## 4 15 2017 1 Ungraded
## 5 16 2015 1 Ungraded
## 6 6 2017 <NA>
year2013 <- length(which(data$exam.year == "2013"))
year2014 <- length(which(data$exam.year == "2014"))
year2015 <- length(which(data$exam.year == "2015"))
year2016 <- length(which(data$exam.year == "2016"))
year2017 <- length(which(data$exam.year == "2017"))
slices <- c(year2013, year2014, year2015, year2016, year2017)
lbls <- c("2013", "2014", "2015", "2016", "2017")
pct <- round(slices/sum(slices)*100)
lbls <- paste(lbls, pct) # add percents values to labels
lbls <- paste(lbls,"%",sep="") # add "%" to labels
pie(slices,labels = lbls, col=rainbow(length(lbls)),
main="Pie Chart showing proportion of the students who wrote CSEC")