This report was prepared in fulfilment of an assignment for an introduction to data mining class. Some of the analysis may contain errors, which will be corrected over time. Findings in this document should not be generalised.
The purpose of this document is to provide a brief overview of data analysis and visualization techniques using R. The dataset used in this overview was taken from: https://www.kaggle.com/spscientist/students-performance-in-exams
# Load the Kaggle "Students Performance in Exams" data set.
# stringsAsFactors = TRUE keeps the categorical columns as factors on
# R >= 4.0 (where the default became FALSE). The factor-based summaries and
# the later as.numeric() conversion of parental.level.of.education rely on
# these columns being factors.
data <- read.csv(
  "/Users/GreatWallUser/Documents/Rstudio Class/StudentsPerformance.csv",
  stringsAsFactors = TRUE
)
head(data, 10) # Show the first ten rows of the data set
## gender race.ethnicity parental.level.of.education lunch
## 1 female group B bachelor's degree standard
## 2 female group C some college standard
## 3 female group B master's degree standard
## 4 male group A associate's degree free/reduced
## 5 male group C some college standard
## 6 female group B associate's degree standard
## 7 female group B some college standard
## 8 male group B some college free/reduced
## 9 male group D high school free/reduced
## 10 female group B high school free/reduced
## test.preparation.course math.score reading.score writing.score
## 1 none 72 72 74
## 2 completed 69 90 88
## 3 none 90 95 93
## 4 none 47 57 44
## 5 none 76 78 75
## 6 none 71 83 78
## 7 completed 88 95 92
## 8 none 40 43 39
## 9 completed 64 64 67
## 10 none 38 60 50
# Per-column summary: level counts for the factor columns, and min/quartiles/mean/max for the numeric scores.
summary(data)#give you a quick summary of the dataframe
## gender race.ethnicity parental.level.of.education lunch
## female:518 group A: 89 associate's degree:222 free/reduced:355
## male :482 group B:190 bachelor's degree :118 standard :645
## group C:319 high school :196
## group D:262 master's degree : 59
## group E:140 some college :226
## some high school :179
## test.preparation.course math.score reading.score writing.score
## completed:358 Min. : 0.00 Min. : 17.00 Min. : 10.00
## none :642 1st Qu.: 57.00 1st Qu.: 59.00 1st Qu.: 57.75
## Median : 66.00 Median : 70.00 Median : 69.00
## Mean : 66.09 Mean : 69.17 Mean : 68.05
## 3rd Qu.: 77.00 3rd Qu.: 79.00 3rd Qu.: 79.00
## Max. :100.00 Max. :100.00 Max. :100.00
# Compact structure listing: number of rows/columns, each column's type and its first few values.
str(data)#give you a quick description of the dataframe variables
## 'data.frame': 1000 obs. of 8 variables:
## $ gender : Factor w/ 2 levels "female","male": 1 1 1 2 2 1 1 2 2 1 ...
## $ race.ethnicity : Factor w/ 5 levels "group A","group B",..: 2 3 2 1 3 2 2 2 4 2 ...
## $ parental.level.of.education: Factor w/ 6 levels "associate's degree",..: 2 5 4 1 5 1 5 5 3 3 ...
## $ lunch : Factor w/ 2 levels "free/reduced",..: 2 2 2 1 2 2 2 1 1 1 ...
## $ test.preparation.course : Factor w/ 2 levels "completed","none": 2 1 2 2 2 2 1 2 1 2 ...
## $ math.score : int 72 69 90 47 76 71 88 40 64 38 ...
## $ reading.score : int 72 90 95 57 78 83 95 43 64 60 ...
## $ writing.score : int 74 88 93 44 75 78 92 39 67 50 ...
# Frequency table (count, percentage, cumulative percentage) for every
# categorical column of the data frame.
# NOTE(review): freq() is not base R; presumably it comes from a package
# loaded outside this view (funModeling?) - confirm.
freq(data)
## gender frequency percentage cumulative_perc
## 1 female 518 51.8 51.8
## 2 male 482 48.2 100.0
## race.ethnicity frequency percentage cumulative_perc
## 1 group C 319 31.9 31.9
## 2 group D 262 26.2 58.1
## 3 group B 190 19.0 77.1
## 4 group E 140 14.0 91.1
## 5 group A 89 8.9 100.0
## parental.level.of.education frequency percentage cumulative_perc
## 1 some college 226 22.6 22.6
## 2 associate's degree 222 22.2 44.8
## 3 high school 196 19.6 64.4
## 4 some high school 179 17.9 82.3
## 5 bachelor's degree 118 11.8 94.1
## 6 master's degree 59 5.9 100.0
## lunch frequency percentage cumulative_perc
## 1 standard 645 64.5 64.5
## 2 free/reduced 355 35.5 100.0
## test.preparation.course frequency percentage cumulative_perc
## 1 none 642 64.2 64.2
## 2 completed 358 35.8 100.0
## [1] "Variables processed: gender, race.ethnicity, parental.level.of.education, lunch, test.preparation.course"
# Cross-tabulate parental education (rows) against race/ethnicity group
# (columns); this contingency table feeds the chi-squared test.
parentlevel_race <- table(
  data[["parental.level.of.education"]],
  data[["race.ethnicity"]]
)
parentlevel_race
##
## group A group B group C group D group E
## associate's degree 14 41 78 50 39
## bachelor's degree 12 20 40 28 18
## high school 18 48 64 44 22
## master's degree 3 6 19 23 8
## some college 18 37 69 67 35
## some high school 24 38 49 50 18
# Expected cell counts under independence. `correct` (continuity correction)
# only affects 2x2 tables and defaults to TRUE, so the plain call is
# equivalent for this 6x5 table.
chisq.test(parentlevel_race)[["expected"]]
##
## group A group B group C group D group E
## associate's degree 19.758 42.18 70.818 58.164 31.08
## bachelor's degree 10.502 22.42 37.642 30.916 16.52
## high school 17.444 37.24 62.524 51.352 27.44
## master's degree 5.251 11.21 18.821 15.458 8.26
## some college 20.114 42.94 72.094 59.212 31.64
## some high school 15.931 34.01 57.101 46.898 25.06
# Pearson chi-squared test of independence between parental education and
# race/ethnicity group (continuity correction is not applied to 6x5 tables).
chisq.test(x = parentlevel_race)
##
## Pearson's Chi-squared test
##
## data: parentlevel_race
## X-squared = 29.459, df = 20, p-value = 0.07911
# Since the p-value (0.07911) is greater than 0.05, we fail to reject the null hypothesis that parental level of education is independent of (not associated with) the students' race/ethnicity group.
# Label every student as passing or failing the reading exam
# (a reading score of 50 or below is a fail).
StudentsPerformance <- data %>%
  mutate(Reading.Assessment = ifelse(reading.score <= 50, "Fail", "Pass"))

# Count the students who PASSED reading in each race/ethnicity group.
# The earlier version counted every student in each group and never used
# Reading.Assessment, so the chart showed group sizes rather than passes.
passed <- StudentsPerformance$Reading.Assessment == "Pass"
race <- StudentsPerformance$race.ethnicity
Ga <- sum(passed & race == "group A", na.rm = TRUE)
Gb <- sum(passed & race == "group B", na.rm = TRUE)
Gc <- sum(passed & race == "group C", na.rm = TRUE)
Gd <- sum(passed & race == "group D", na.rm = TRUE)
Ge <- sum(passed & race == "group E", na.rm = TRUE)

slices <- c(Ga, Gb, Gc, Gd, Ge)
lbls <- c("GroupA", "GroupB", "GroupC", "GroupD", "GroupE")
pct <- round(slices / sum(slices) * 100) # each group's share of all passes
lbls <- paste0(lbls, " ", pct, "%")      # e.g. "GroupA 9%"
pie(slices, labels = lbls, col = rainbow(length(lbls)),
    main = "Pie Chart showing percent of passes by Race/Ethnic Group")
# Average of the three exam scores for every student, addressed by column
# name rather than by position.
data$AverageGrade <- rowMeans(
  data[, c("math.score", "reading.score", "writing.score")]
)

# Students whose average is 80 or higher, reduced to gender and average.
greatAvg <- filter(data, AverageGrade >= 80)
finalavgrade <- greatAvg %>% select(gender, AverageGrade)

# Number of students with an average of 80 or higher.
nofinalavgrade <- nrow(greatAvg)
nofinalavgrade
## [1] 198
# Preview the first six high-achieving students (gender and average grade).
head(finalavgrade)
## gender AverageGrade
## 1 female 82.33333
## 2 female 92.66667
## 3 female 91.66667
## 4 male 87.66667
## 5 male 88.66667
## 6 male 80.33333
# Class mean of the reading score, then the students who scored above it.
readingavg <- mean(data[["reading.score"]])
newdata <- filter(data, reading.score > readingavg)

# Gender of each above-average reader; the echoed table shows the genders
# as integer codes (1 and 2).
gendercount <- c(newdata[["gender"]])
table(gendercount)
## gendercount
## 1 2
## 318 195
# Bar chart of how many students of each gender scored above the class mean
# in reading. names.arg relabels the two bars, which arrive in code order
# (1 = female, 2 = male).
barplot(
  height = table(gendercount),
  names.arg = c("Female", "Male"),
  main = "Comparsion of Gender with grades are above Class Average",
  xlab = "Gender",
  ylab = "Total Number",
  col = "blue",
  density = 10,
  border = "red",
  col.axis = "red"
)
# Side-by-side horizontal boxplots of the writing and reading scores.
# The group labels must follow the order of the data arguments: the first
# box is writing.score and the second is reading.score (the earlier labels
# "Math Score" / "Writing Score" did not match the plotted columns).
boxplot(data$writing.score, data$reading.score,
  horizontal = TRUE,
  col = c("Blue", "red"),
  main = "Boxplot showing how writing and reading grades are outstretched ",
  at = c(1, 2),
  names = c("Writing Score", "Reading Score")
)
# Keep only the students who completed the test preparation course and plot
# the distribution of each numeric column for that subset.
testcompleted <- data %>%
  filter(test.preparation.course == "completed")
plot_num(testcompleted)
# Students whose parents hold a master's degree, who receive free/reduced
# lunch, and who scored at least 90 in writing. Comma-separated filter
# conditions are combined with AND.
list <- data %>%
  filter(
    parental.level.of.education == "master's degree",
    lunch == "free/reduced",
    writing.score >= 90
  ) %>%
  select(gender, parental.level.of.education, lunch, writing.score)

# Number of matching students.
count(list)
## # A tibble: 1 x 1
## n
## <int>
## 1 3
# Print the matching students themselves.
# NOTE(review): this variable is named `list`, which shadows base::list as a variable name.
list
## gender parental.level.of.education lunch writing.score
## 1 female master's degree free/reduced 100
## 2 female master's degree free/reduced 94
## 3 male master's degree free/reduced 90
# Encode parental education as numeric codes for use as a model predictor.
# Going through factor() makes this work whether the column was read as a
# factor or as character (the default on R >= 4.0, where as.numeric() on the
# raw strings would yield all NA). It is also safe to re-run.
# NOTE(review): the codes follow alphabetical level order
# (1 = associate's degree ... 6 = some high school), not educational
# attainment, so a linear coefficient on this column has no ordinal meaning.
data$parental.level.of.education <-
  as.numeric(factor(data$parental.level.of.education))
head(data)
## gender race.ethnicity parental.level.of.education lunch
## 1 female group B 2 standard
## 2 female group C 5 standard
## 3 female group B 4 standard
## 4 male group A 1 free/reduced
## 5 male group C 5 standard
## 6 female group B 1 standard
## test.preparation.course math.score reading.score writing.score AverageGrade
## 1 none 72 72 74 72.66667
## 2 completed 69 90 88 82.33333
## 3 none 90 95 93 92.66667
## 4 none 47 57 44 49.33333
## 5 none 76 78 75 76.33333
## 6 none 71 83 78 77.33333
# Shuffle the rows reproducibly, then split them 80% / 20% into train and
# test sets.
set.seed(200)
h <- runif(nrow(data))         # one random sort key per row
datasample <- data[order(h), ] # rows in random order
sampleSize <- floor(.8 * nrow(data))

# Derive the split from sampleSize instead of the hard-coded 1:800 and
# 801:1000, so the partition stays 80/20 for any number of rows. The unused
# `indexes` sample was dropped; it never took part in the split.
in_train <- seq_len(nrow(datasample)) <= sampleSize
train <- datasample[in_train, ]
test <- datasample[!in_train, ]
head(train)
## gender race.ethnicity parental.level.of.education lunch
## 228 male group D 3 standard
## 811 male group A 6 standard
## 108 male group E 1 standard
## 839 male group B 1 free/reduced
## 523 male group D 2 standard
## 80 female group E 4 standard
## test.preparation.course math.score reading.score writing.score AverageGrade
## 228 none 57 50 54 53.66667
## 811 none 51 31 36 39.33333
## 108 completed 66 63 64 64.33333
## 839 completed 58 57 53 56.00000
## 523 none 69 58 57 61.33333
## 80 none 62 68 68 66.00000
# Preview the first six rows of the held-out test set.
head(test)
## gender race.ethnicity parental.level.of.education lunch
## 383 male group C 4 free/reduced
## 410 male group D 1 standard
## 259 female group B 5 standard
## 217 female group E 1 free/reduced
## 579 female group B 5 free/reduced
## 561 female group D 5 standard
## test.preparation.course math.score reading.score writing.score AverageGrade
## 383 none 79 81 71 77.00000
## 410 completed 87 84 85 85.33333
## 259 none 70 75 78 74.33333
## 217 completed 83 86 88 85.66667
## 579 completed 48 56 58 54.00000
## 561 completed 74 75 79 76.00000
# Model 1 - simple linear regression of AverageGrade on reading.score,
# fitted on the training set.
# NOTE(review): AverageGrade is the row mean of the math, reading and writing
# scores, so reading.score is one of its components and a high R-squared is
# expected by construction.
model_1 <- lm(AverageGrade~ reading.score, data = train)
summary(model_1)
##
## Call:
## lm(formula = AverageGrade ~ reading.score, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.9857 -2.2966 0.0069 2.2735 9.9671
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.134737 0.587758 3.632 0.000299 ***
## reading.score 0.948377 0.008318 114.018 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.476 on 798 degrees of freedom
## Multiple R-squared: 0.9422, Adjusted R-squared: 0.9421
## F-statistic: 1.3e+04 on 1 and 798 DF, p-value: < 2.2e-16
# Model 2 - multiple linear regression of AverageGrade on reading.score and
# writing.score, fitted on the training set.
# NOTE(review): both predictors are components of AverageGrade (the row mean
# of the three scores), so the fit is close to exact by construction.
model_2 <- lm(AverageGrade~ reading.score + writing.score, data = train)
summary(model_2)
##
## Call:
## lm(formula = AverageGrade ~ reading.score + writing.score, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.9061 -2.0188 0.1133 1.9873 8.3333
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.44107 0.49184 4.963 8.49e-07 ***
## reading.score 0.52365 0.02394 21.876 < 2e-16 ***
## writing.score 0.42722 0.02304 18.544 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.907 on 797 degrees of freedom
## Multiple R-squared: 0.9596, Adjusted R-squared: 0.9595
## F-statistic: 9465 on 2 and 797 DF, p-value: < 2.2e-16
# Model 3 - multiple linear regression of AverageGrade on math.score,
# writing.score and the numerically coded parental.level.of.education,
# fitted on the training set.
# NOTE(review): math.score and writing.score are components of AverageGrade,
# and parental.level.of.education enters as alphabetical level codes, so its
# coefficient should not be read as an ordinal effect.
model_3 <- lm(AverageGrade~math.score + writing.score + parental.level.of.education , data = train)
summary(model_3)
##
## Call:
## lm(formula = AverageGrade ~ math.score + writing.score + parental.level.of.education,
## data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.6616 -0.9218 0.0197 0.9613 3.9405
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.572084 0.257050 6.116 1.5e-09 ***
## math.score 0.376409 0.005398 69.725 < 2e-16 ***
## writing.score 0.605836 0.005396 112.280 < 2e-16 ***
## parental.level.of.education 0.027913 0.026966 1.035 0.301
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.38 on 796 degrees of freedom
## Multiple R-squared: 0.9909, Adjusted R-squared: 0.9909
## F-statistic: 2.891e+04 on 3 and 796 DF, p-value: < 2.2e-16
# Compare the three fitted models by information criteria (lower is better).
# Akaike information criterion for each model:
AIC(model_1)
## [1] 4267.6
AIC(model_2)
## [1] 3982.646
AIC(model_3)
## [1] 2791.758
# Bayesian information criterion for each model; model_3 has the lowest
# AIC and BIC of the three.
BIC(model_1)
## [1] 4281.654
BIC(model_2)
## [1] 4001.384
BIC(model_3)
## [1] 2815.182
# Predict AverageGrade for the held-out test rows with model 3 and place the
# predictions next to the actual averages. Row names come from the named
# prediction vector, i.e. the original row ids of the test set.
mathscorepred <- predict(model_3, newdata = test)
actuals_preds <- data.frame(
  Avggrade = test$AverageGrade,
  PredictedScore = mathscorepred
)
head(actuals_preds)
## Avggrade PredictedScore
## 383 77.00000 74.43436
## 410 85.33333 85.84359
## 259 74.33333 75.31544
## 217 85.66667 86.15546
## 579 54.00000 54.91773
## 561 76.00000 77.42691
# Pearson correlation (with significance test and confidence interval)
# between the actual average grades and the model 3 predictions.
cor.test(
  x = actuals_preds$Avggrade,
  y = actuals_preds$PredictedScore,
  method = "pearson"
)
##
## Pearson's product-moment correlation
##
## data: actuals_preds$Avggrade and actuals_preds$PredictedScore
## t = 132.15, df = 198, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9925745 0.9957456
## sample estimates:
## cor
## 0.9943788