rm(list = ls())
#setwd("C:/Users/nghimire/OneDrive - The University of Texas at Tyler/Overton Data Summer 2022")
library(tidyverse) # To manipulate dataframe
library(ggplot2) # Just in case, we need to visualize something
library(readxl) # To read data from excel spreadsheet
library(haven) # To read spss data
# Loading Data
data_2021 <- read_spss("Overton Summer 2021 School Data.sav")
data_2021 <- data.frame(data_2021)
# Checking the Variable Names
names(data_2021)
[1] "LastName" "FirstName"
[3] "Gender" "Ethnicity"
[5] "Grade" "Teacher"
[7] "Attendance" "Wordsread"
[9] "MARSIGRSPre" "MARSIGRSPost"
[11] "MARSIPSSPre" "MARSIPSSPost"
[13] "MARSISRSPre" "MARSISRSPost"
[15] "MARSITotalPre" "MARSITotalPost"
[17] "GRADEWRRawPre" "GRADEWRRawPost"
[19] "GRADEWRStaninePre" "GRADEWRStaninePost"
[21] "GRADEWRGEPre" "GRADEWRGEPost"
[23] "GRADEWRSSPre" "GRADEWRSSPost"
[25] "GRADEWRNCEPre" "GRADEWRNCEPost"
[27] "GRADEWRGSVPre" "GRADEWRGSVPost"
[29] "GRADEVocRSPre" "GRADEVocRSPost"
[31] "GRADEVocStaninePre" "GRADEVocStaninePost"
[33] "GRADEVocGEPre" "GRADEVocGEPost"
[35] "GRADEVocSSPre" "GRADEVocSSPost"
[37] "GRADEVocNCEPre" "GRADEVocNCEPost"
[39] "GRADEVocGSVPre" "GRADEVocGSVPost"
[41] "GRADEVocCompRSPre" "GRADEVocCompRSPost"
[43] "GRADEVocCompStaninePre" "GRADEVocCompStaninePost"
[45] "GRADEVocCompGEPre" "GRADEVocCompGEPost"
[47] "GRADEVocCompSSPre" "GRADEVocCompSSPost"
[49] "GRADEVocCompNCEPre" "GRADEVocCompNCEPost"
[51] "GRADEVocCompGSVPre" "GRADEVocCompGSVPost"
[53] "GRADESenCompRSPre" "GRADESenCompRSPost"
[55] "GRADESenCompStaninePre" "GRADESenCompStaninePost"
[57] "GRADESenCompGEPre" "GRADESenCompGEPost"
[59] "GRADESenCompSSPre" "GRADESenCompSSPost"
[61] "GRADESenCompNCEPre" "GRADESenCompNCEPost"
[63] "GRADESenCompGSVPre" "GRADESenCompGSVPost"
[65] "GRADEPassCompRSPre" "GRADEPassCompRSPost"
[67] "GRADEPassCompStaninePre" "GRADEPassCompStaninePost"
[69] "GRADEPassCompGEPre" "GRADEPassCompGEPost"
[71] "GRADEPassCompSSPre" "GRADEPassCompSSPost"
[73] "GRADEPassCompNCEPre" "GRADEPassCompNCEPost"
[75] "GRADEPassCompGSVPre" "GRADEPassCompGSVPost"
[77] "GRADECompCompRSPre" "GRADECompCompRSPost"
[79] "GRADECompCompStaninePre" "GRADECompCompStaninePost"
[81] "GRADECompCompGEPre" "GRADECompCompGEPost"
[83] "GRADECompCompSSPre" "GRADECompCompSSPost"
[85] "GRADECompCompNCEPre" "GRADECompCompNCEPost"
[87] "GRADECompCompGSVPre" "GRADECompCompGSVPost"
[89] "GRADETotalTRSPre" "GRADETotalTRSPost"
[91] "GRADETotalTStaninePre" "GRADETotalTStaninePost"
[93] "GRADETotalTGEPre" "GRADETotalTGEPost"
[95] "GRADETotalTSSPre" "GRADETotalTSSPost"
[97] "GRADETotalTNCEPre" "GRADETotalTNCEPost"
[99] "GRADETotalTGSVPre" "GRADETotalTGSVPost"
[101] "GRADEListCompRSPre" "GRADEListCompRSPost"
[103] "GRADEListCompStaninePre" "GRADEListCompStaninePost"
[105] "GRADEListCompGEPre" "GRADEListCompGEPost"
[107] "GRADEListCompSSPre" "GRADEListCompSSPost"
[109] "GRADEListCompNCEPre" "GRADEListCompNCEPost"
[111] "GRADEListCompGSVPre" "GRADEListCompGSVPost"
short_2021 <- select(data_2021, LastName, Gender, Grade, Teacher, Attendance, Wordsread, MARSITotalPre, MARSITotalPost, GRADETotalTGEPre, GRADETotalTGEPost, GRADETotalTNCEPre, GRADETotalTNCEPost, GRADETotalTGSVPre, GRADETotalTGSVPost, GRADEListCompRSPre, GRADEListCompRSPost)
# Making Sure
names(short_2021)
[1] "LastName" "Gender" "Grade"
[4] "Teacher" "Attendance" "Wordsread"
[7] "MARSITotalPre" "MARSITotalPost" "GRADETotalTGEPre"
[10] "GRADETotalTGEPost" "GRADETotalTNCEPre" "GRADETotalTNCEPost"
[13] "GRADETotalTGSVPre" "GRADETotalTGSVPost" "GRADEListCompRSPre"
[16] "GRADEListCompRSPost"
colnames(short_2021) <- c("student", "gender", "grade", "tech_er", "attendance", "words_read", "pre_marsi_avg", "post_marsi_avg", "pre_tt_ge", "post_tt_ge", "pre_tt_nce", "post_tt_nce", "pre_tt_gsv", "post_tt_gsv", "listening_pre", "listening_post")
#short_2021$gender <- as.factor(short_2021$gender)
#short_2021$grade <- as.factor(short_2021$grade)
#short_2021$tech_er <- as.factor(short_2021$tech_er)
str(short_2021)
'data.frame': 20 obs. of 16 variables:
$ student : dbl+lbl [1:20] 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,...
..@ label : chr "Last Name"
..@ format.spss : chr "F8.2"
..@ display_width: int 11
..@ labels : Named num 1 2 3 4 5 6 7 8 9 10 ...
.. ..- attr(*, "names")= chr [1:20] "Bobbitt" "Boney" "Devora" "Holland" ...
$ gender : dbl+lbl [1:20] 2, 2, 2, 1, 2, 2, 2, 1, 1, 1, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2
..@ label : chr "Gender"
..@ format.spss : chr "F8.2"
..@ display_width: int 11
..@ labels : Named num 1 2
.. ..- attr(*, "names")= chr [1:2] "Male" "Female"
$ grade : dbl+lbl [1:20] 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 2
..@ label : chr "Grade"
..@ format.spss : chr "F8.2"
..@ display_width: int 15
..@ labels : Named num 1 2 3
.. ..- attr(*, "names")= chr [1:3] "Third Grade" "Fourth Grade" "Fifth Grade"
$ tech_er : dbl+lbl [1:20] 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 4, 4, 4, 3, 4, 3, 4, 4, 2
..@ label : chr "Teacher"
..@ format.spss : chr "F8.2"
..@ display_width: int 19
..@ labels : Named num 1 2 3 4
.. ..- attr(*, "names")= chr [1:4] "Maryanna Luna" "Ceara Oliver" "Jacie Gunter" "Christy Turner"
$ attendance : num 16 18 11 17 14 15 17 10 15 18 ...
..- attr(*, "label")= chr "Days Attended"
..- attr(*, "format.spss")= chr "F8.2"
..- attr(*, "display_width")= int 11
$ words_read : num 51858 28575 49406 38867 33160 ...
..- attr(*, "label")= chr "Words Read"
..- attr(*, "format.spss")= chr "F8.2"
..- attr(*, "display_width")= int 17
$ pre_marsi_avg : num 2.73 2.6 3.2 3 3.8 3.8 3 2.6 3.86 2.73 ...
..- attr(*, "label")= chr "MARSI Composite Score Pretest"
..- attr(*, "format.spss")= chr "F8.2"
..- attr(*, "display_width")= int 15
$ post_marsi_avg: num 3 4.2 NA 3.2 3.46 3.73 3.2 NA 5 3.6 ...
..- attr(*, "label")= chr "MARSI Composite Score Post"
..- attr(*, "format.spss")= chr "F8.2"
..- attr(*, "display_width")= int 19
$ pre_tt_ge : num 2.4 2.3 2 1.8 2.1 2.7 3.7 2.8 3.7 3.5 ...
..- attr(*, "label")= chr "GRADE Total Test Grade Equivalent Score Pre"
..- attr(*, "format.spss")= chr "F8.2"
$ post_tt_ge : num 2 3.1 NA 1.8 2.2 NA 3.8 NA 3.3 2.8 ...
..- attr(*, "label")= chr "GRADE Total Test Grade Equivalent Score Post"
..- attr(*, "format.spss")= chr "F8.2"
$ pre_tt_nce : num 39 36 28 21 32 44 37 36 37 36 ...
..- attr(*, "label")= chr "GRADE Total Test National Curve Equivalent Pre"
..- attr(*, "format.spss")= chr "F8.2"
..- attr(*, "display_width")= int 21
$ post_tt_nce : num 28 50 NA 19 35 NA 47 NA 43 36 ...
..- attr(*, "label")= chr "GRADE Total Test National Curve Equivalent Post"
..- attr(*, "format.spss")= chr "F8.2"
..- attr(*, "display_width")= int 21
$ pre_tt_gsv : num 406 401 385 375 394 415 434 417 434 430 ...
..- attr(*, "label")= chr "GRADE Total Test Growth Scale Value Pre"
..- attr(*, "format.spss")= chr "F8.2"
..- attr(*, "display_width")= int 22
$ post_tt_gsv : num 387 423 NA 372 398 NA 436 NA 428 418 ...
..- attr(*, "label")= chr "GRADE Total Test Growth Scale Value Post"
..- attr(*, "format.spss")= chr "F8.2"
..- attr(*, "display_width")= int 21
$ listening_pre : num 15 13 15 12 12 16 16 14 17 15 ...
..- attr(*, "label")= chr "GRADE Listening Comprehension Raw Score Pre"
..- attr(*, "format.spss")= chr "F8.2"
..- attr(*, "display_width")= int 26
$ listening_post: num 13 12 NA 11 11 12 12 NA 13 9 ...
..- attr(*, "label")= chr "GRADE Listening Comprehension Raw Score Post"
..- attr(*, "format.spss")= chr "F8.2"
..- attr(*, "display_width")= int 30
# Loading Required Library
library(sjlabelled)
# Checking the Labels in the Entire Dataset
#get_labels(short_2021[ ,1:4])
# Getting Rid of the Labels
short_2021$student <- remove_labels(short_2021$student, labels = c(1:20)) # There were 20 lebels, i.e., names of the students and getting rid of all of them
# Checking back
get_labels(short_2021$student)
NULL
get_labels(short_2021[ ,2:4])
$gender
[1] "Male" "Female"
$grade
[1] "Third Grade" "Fourth Grade" "Fifth Grade"
$tech_er
[1] "Maryanna Luna" "Ceara Oliver" "Jacie Gunter" "Christy Turner"
short_2021$gender <- replace_labels(short_2021$gender,
labels = c(
"Male" = "1",
"Female" = "2"
))
short_2021$grade <- replace_labels(short_2021$grade,
labels = c(
"Second Grade" = "1",
"Third Grade" = "2",
"Fourth Grade" = "3"
))
short_2021$tech_er <- replace_labels(short_2021$tech_er,
labels = c(
"LM" = "1",
"OC" = "2",
"TC" = "3",
"GJ" = "4"
))
get_labels(short_2021[ ,2:4])
$gender
[1] "Male" "Female"
$grade
[1] "Second Grade" "Third Grade" "Fourth Grade"
$tech_er
[1] "LM" "OC" "TC" "GJ"
short_2021$year <- c(0)
summary(short_2021)
student gender grade tech_er attendance
Min. : 1.00 Min. :1.0 Min. :1.0 Min. :1.0 Min. : 1.00
1st Qu.: 5.75 1st Qu.:1.0 1st Qu.:1.0 1st Qu.:1.0 1st Qu.:10.75
Median :10.50 Median :2.0 Median :2.0 Median :2.0 Median :15.50
Mean :10.50 Mean :1.7 Mean :2.1 Mean :2.4 Mean :13.10
3rd Qu.:15.25 3rd Qu.:2.0 3rd Qu.:3.0 3rd Qu.:4.0 3rd Qu.:17.00
Max. :20.00 Max. :2.0 Max. :3.0 Max. :4.0 Max. :18.00
words_read pre_marsi_avg post_marsi_avg pre_tt_ge
Min. : 16631 Min. :2.600 Min. :3.000 Min. :1.800
1st Qu.: 24946 1st Qu.:2.865 1st Qu.:3.400 1st Qu.:2.450
Median : 33160 Median :3.730 Median :3.730 Median :2.900
Mean : 40343 Mean :3.429 Mean :3.813 Mean :3.261
3rd Qu.: 48778 3rd Qu.:3.860 3rd Qu.:4.200 3rd Qu.:3.700
Max. :108885 Max. :4.260 Max. :5.000 Max. :6.200
NA's :5 NA's :1 NA's :7 NA's :2
post_tt_ge pre_tt_nce post_tt_nce pre_tt_gsv post_tt_gsv
Min. :1.800 Min. :21.00 Min. :19.00 Min. :375.0 Min. :372
1st Qu.:2.800 1st Qu.:28.00 1st Qu.:35.00 1st Qu.:407.5 1st Qu.:418
Median :3.300 Median :36.00 Median :37.00 Median :419.5 Median :428
Mean :3.308 Mean :35.94 Mean :38.38 Mean :421.1 Mean :422
3rd Qu.:3.800 3rd Qu.:42.75 3rd Qu.:43.00 3rd Qu.:434.0 3rd Qu.:436
Max. :6.400 Max. :56.00 Max. :57.00 Max. :466.0 Max. :467
NA's :7 NA's :2 NA's :7 NA's :2 NA's :7
listening_pre listening_post year
Min. : 8.00 Min. : 9.00 Min. :0
1st Qu.:13.00 1st Qu.:11.00 1st Qu.:0
Median :15.00 Median :12.00 Median :0
Mean :14.05 Mean :12.36 Mean :0
3rd Qu.:15.50 3rd Qu.:13.00 3rd Qu.:0
Max. :17.00 Max. :16.00 Max. :0
NA's :1 NA's :6
Thereโs considerable number of missing data in all cases.
# Loading Data
data_2022 <- read_spss("reading_data_2022.sav")
data_2022 <- data.frame(data_2022)
# Checking the Variable Names
names(data_2022)
[1] "student_name" "student"
[3] "year" "year_coded"
[5] "grade" "gender"
[7] "teacher" "tech_er"
[9] "words_read" "pre_marsi_grs"
[11] "pre_marsi_pss" "pre_marsi_srs"
[13] "pre_marsi_avg" "posttest_vocab_raw"
[15] "posttest_vocab_stanine" "posttest_vocab_ile"
[17] "posttest_vocab_ge" "posttest_vocab_ss"
[19] "posttest_vocab_nce" "posttest_cc_raw"
[21] "posttest_cc_stanine" "posttest_cc_ile"
[23] "posttest_cc_ge" "posttest_cc_ss"
[25] "posttest_cc_nce" "posttest_listening_comp"
[27] "posttest_listening_stanine" "posttest_raw_total"
[29] "posttest_tt_stanine" "posttest_tt_ile"
[31] "posttest_tt_ge" "posttest_tt_ss"
[33] "posttest_tt_nce" "posttest_tt_gsv"
[35] "post_marsi_grs" "post_marsi_pss"
[37] "post_marsi_srs" "post_marsi_avg"
[39] "pretest_vocab_raw" "pretest_vocab_stanine"
[41] "pretest_vocab_ile" "pretest_vocab_ge"
[43] "pretest_vocab_ss" "pretest_vocab_nce"
[45] "pretest_cc_raw" "pretest_cc_stanine"
[47] "pretest_cc_ile" "pretest_cc_ge"
[49] "pretest_cc_ss" "pretest_cc_nce"
[51] "pretest_listening_comp" "pretest_listening_stanine"
[53] "pretest_raw_total" "pretest_tt_stanine"
[55] "pretest_tt_ile" "pretest_tt_ge"
[57] "pretest_tt_ss" "pretest_tt_nce"
[59] "pretest_tt_gsv"
# Selecting Required Columns
short_2022 <- select(data_2022, gender, student, grade, tech_er, words_read, pre_marsi_avg, pretest_tt_ge, posttest_tt_ge, pretest_tt_nce, posttest_tt_nce, pretest_tt_gsv, posttest_tt_gsv, pretest_listening_stanine, posttest_listening_stanine, year_coded)
# Renaming Columns to match with the columns in 2021 data
short_2022 <- short_2022|>
rename(year = year_coded,
listening_pre = pretest_listening_stanine,
listening_post = posttest_listening_stanine,
pre_tt_ge = pretest_tt_ge,
post_tt_ge = posttest_tt_ge,
pre_tt_nce = pretest_tt_nce,
post_tt_nce = posttest_tt_nce,
pre_tt_gsv = pretest_tt_gsv,
post_tt_gsv = posttest_tt_gsv
)
# Checking the new list of included variables
names(short_2022)
[1] "gender" "student" "grade" "tech_er"
[5] "words_read" "pre_marsi_avg" "pre_tt_ge" "post_tt_ge"
[9] "pre_tt_nce" "post_tt_nce" "pre_tt_gsv" "post_tt_gsv"
[13] "listening_pre" "listening_post" "year"
# Changing the class of some of the variables
short_2022$grade <- as.factor(short_2022$grade)
#short_2022$gender <- as.factor(short_2022$gender)
#short_2022$tech_er <- as.factor(short_2022$tech_er)
# cheking for the labels of the selected variables
get_labels(short_2022[, 3:5])
$grade
[1] "1" "2" "3" "4"
$tech_er
[1] "." "MC" "WM" "TC" "FJ"
$words_read
NULL
# Having a look at the data
#short_2022
# Changing the labels by new names
short_2022$grade <- replace_labels(short_2022$grade,
labels = c(
"Second Grade" = "1",
"Third Grade" = "2",
"Fourth Grade" = "3",
"Fifth Grade" = "4"
))
#Changing a dot into an NA in gender and tech_er variables
is.na(short_2022$gender) <- short_2022$gender== "."
is.na(short_2022$tech_er) <- short_2022$tech_er == "."
# Checking if that worked
get_labels(short_2022[, 3:5])
$grade
[1] "Second Grade" "Third Grade" "Fourth Grade" "Fifth Grade"
$tech_er
[1] NA "MC" "WM" "TC" "FJ"
$words_read
NULL
# Comparing variable names
library(waldo)
print(compare(names(short_2021), names(short_2022)), n = 25)
old | new
[1] "student" -
[2] "gender" | "gender" [1]
- "student" [2]
[3] "grade" | "grade" [3]
[4] "tech_er" | "tech_er" [4]
[5] "attendance" -
[6] "words_read" | "words_read" [5]
[7] "pre_marsi_avg" | "pre_marsi_avg" [6]
[8] "post_marsi_avg" -
[9] "pre_tt_ge" | "pre_tt_ge" [7]
... ... ... and 2 more ...
rbind.fill()
function from the {plyr} packagelibrary(plyr)
merged_data <- rbind.fill(short_2021, short_2022)
summary(merged_data)
student gender grade tech_er
Min. : 1.00 Length:39 Min. :1.000 Length:39
1st Qu.: 5.50 Class :character 1st Qu.:1.000 Class :character
Median :10.00 Mode :character Median :2.000 Mode :character
Mean :10.26 Mean :2.308
3rd Qu.:15.00 3rd Qu.:3.000
Max. :20.00 Max. :4.000
attendance words_read pre_marsi_avg post_marsi_avg
Min. : 1.00 Min. : 3165 Min. :1.530 Min. :3.000
1st Qu.:10.75 1st Qu.: 19620 1st Qu.:2.783 1st Qu.:3.400
Median :15.50 Median : 30453 Median :3.365 Median :3.730
Mean :13.10 Mean : 35590 Mean :3.380 Mean :3.813
3rd Qu.:17.00 3rd Qu.: 48778 3rd Qu.:3.877 3rd Qu.:4.200
Max. :18.00 Max. :108885 Max. :4.870 Max. :5.000
NA's :19 NA's :8 NA's :3 NA's :26
pre_tt_ge post_tt_ge pre_tt_nce post_tt_nce
Min. :0.800 Min. :0.900 Min. :15.00 Min. :19.00
1st Qu.:2.100 1st Qu.:2.200 1st Qu.:30.50 1st Qu.:37.00
Median :2.750 Median :3.100 Median :36.50 Median :46.00
Mean :3.126 Mean :3.183 Mean :40.91 Mean :48.28
3rd Qu.:3.700 3rd Qu.:3.900 3rd Qu.:49.00 3rd Qu.:53.00
Max. :6.700 Max. :6.400 Max. :88.00 Max. :84.00
NA's :5 NA's :10 NA's :5 NA's :10
pre_tt_gsv post_tt_gsv listening_pre listening_post year
Min. :314.0 Min. :319.0 Min. : 2.0 Min. : 1.0 Min. :0.0000
1st Qu.:394.5 1st Qu.:399.0 1st Qu.: 4.5 1st Qu.: 5.0 1st Qu.:0.0000
Median :416.0 Median :423.0 Median :12.0 Median : 9.0 Median :0.0000
Mean :414.7 Mean :417.2 Mean : 9.8 Mean : 8.6 Mean :0.4872
3rd Qu.:434.0 3rd Qu.:438.0 3rd Qu.:15.0 3rd Qu.:12.0 3rd Qu.:1.0000
Max. :470.0 Max. :467.0 Max. :17.0 Max. :16.0 Max. :1.0000
NA's :5 NA's :10 NA's :4 NA's :9
#write.csv(merged_data, "overton_merged_data.csv")