rm(list = ls())
#setwd("C:/Users/nghimire/OneDrive - The University of Texas at Tyler/Overton Data Summer 2022")
library(tidyverse) # To manipulate dataframe
library(ggplot2) # Just in case, we need to visualize something
library(readxl) # To read data from excel spreadsheet
library(haven) # To read spss data

1. Getting 2021 Data Ready

A. Loading Checking the Names of Variables

# Loading Data
data_2021 <- read_spss("Overton Summer 2021 School Data.sav")
data_2021 <- data.frame(data_2021)
# Checking the Variable Names
names(data_2021)
  [1] "LastName"                 "FirstName"               
  [3] "Gender"                   "Ethnicity"               
  [5] "Grade"                    "Teacher"                 
  [7] "Attendance"               "Wordsread"               
  [9] "MARSIGRSPre"              "MARSIGRSPost"            
 [11] "MARSIPSSPre"              "MARSIPSSPost"            
 [13] "MARSISRSPre"              "MARSISRSPost"            
 [15] "MARSITotalPre"            "MARSITotalPost"          
 [17] "GRADEWRRawPre"            "GRADEWRRawPost"          
 [19] "GRADEWRStaninePre"        "GRADEWRStaninePost"      
 [21] "GRADEWRGEPre"             "GRADEWRGEPost"           
 [23] "GRADEWRSSPre"             "GRADEWRSSPost"           
 [25] "GRADEWRNCEPre"            "GRADEWRNCEPost"          
 [27] "GRADEWRGSVPre"            "GRADEWRGSVPost"          
 [29] "GRADEVocRSPre"            "GRADEVocRSPost"          
 [31] "GRADEVocStaninePre"       "GRADEVocStaninePost"     
 [33] "GRADEVocGEPre"            "GRADEVocGEPost"          
 [35] "GRADEVocSSPre"            "GRADEVocSSPost"          
 [37] "GRADEVocNCEPre"           "GRADEVocNCEPost"         
 [39] "GRADEVocGSVPre"           "GRADEVocGSVPost"         
 [41] "GRADEVocCompRSPre"        "GRADEVocCompRSPost"      
 [43] "GRADEVocCompStaninePre"   "GRADEVocCompStaninePost" 
 [45] "GRADEVocCompGEPre"        "GRADEVocCompGEPost"      
 [47] "GRADEVocCompSSPre"        "GRADEVocCompSSPost"      
 [49] "GRADEVocCompNCEPre"       "GRADEVocCompNCEPost"     
 [51] "GRADEVocCompGSVPre"       "GRADEVocCompGSVPost"     
 [53] "GRADESenCompRSPre"        "GRADESenCompRSPost"      
 [55] "GRADESenCompStaninePre"   "GRADESenCompStaninePost" 
 [57] "GRADESenCompGEPre"        "GRADESenCompGEPost"      
 [59] "GRADESenCompSSPre"        "GRADESenCompSSPost"      
 [61] "GRADESenCompNCEPre"       "GRADESenCompNCEPost"     
 [63] "GRADESenCompGSVPre"       "GRADESenCompGSVPost"     
 [65] "GRADEPassCompRSPre"       "GRADEPassCompRSPost"     
 [67] "GRADEPassCompStaninePre"  "GRADEPassCompStaninePost"
 [69] "GRADEPassCompGEPre"       "GRADEPassCompGEPost"     
 [71] "GRADEPassCompSSPre"       "GRADEPassCompSSPost"     
 [73] "GRADEPassCompNCEPre"      "GRADEPassCompNCEPost"    
 [75] "GRADEPassCompGSVPre"      "GRADEPassCompGSVPost"    
 [77] "GRADECompCompRSPre"       "GRADECompCompRSPost"     
 [79] "GRADECompCompStaninePre"  "GRADECompCompStaninePost"
 [81] "GRADECompCompGEPre"       "GRADECompCompGEPost"     
 [83] "GRADECompCompSSPre"       "GRADECompCompSSPost"     
 [85] "GRADECompCompNCEPre"      "GRADECompCompNCEPost"    
 [87] "GRADECompCompGSVPre"      "GRADECompCompGSVPost"    
 [89] "GRADETotalTRSPre"         "GRADETotalTRSPost"       
 [91] "GRADETotalTStaninePre"    "GRADETotalTStaninePost"  
 [93] "GRADETotalTGEPre"         "GRADETotalTGEPost"       
 [95] "GRADETotalTSSPre"         "GRADETotalTSSPost"       
 [97] "GRADETotalTNCEPre"        "GRADETotalTNCEPost"      
 [99] "GRADETotalTGSVPre"        "GRADETotalTGSVPost"      
[101] "GRADEListCompRSPre"       "GRADEListCompRSPost"     
[103] "GRADEListCompStaninePre"  "GRADEListCompStaninePost"
[105] "GRADEListCompGEPre"       "GRADEListCompGEPost"     
[107] "GRADEListCompSSPre"       "GRADEListCompSSPost"     
[109] "GRADEListCompNCEPre"      "GRADEListCompNCEPost"    
[111] "GRADEListCompGSVPre"      "GRADEListCompGSVPost"    

B. Subsetting Data

short_2021 <- select(data_2021, LastName, Gender, Grade, Teacher, Attendance, Wordsread, MARSITotalPre, MARSITotalPost, GRADETotalTGEPre, GRADETotalTGEPost, GRADETotalTNCEPre, GRADETotalTNCEPost, GRADETotalTGSVPre, GRADETotalTGSVPost, GRADEListCompRSPre, GRADEListCompRSPost)
# Making Sure
names(short_2021)
 [1] "LastName"            "Gender"              "Grade"              
 [4] "Teacher"             "Attendance"          "Wordsread"          
 [7] "MARSITotalPre"       "MARSITotalPost"      "GRADETotalTGEPre"   
[10] "GRADETotalTGEPost"   "GRADETotalTNCEPre"   "GRADETotalTNCEPost" 
[13] "GRADETotalTGSVPre"   "GRADETotalTGSVPost"  "GRADEListCompRSPre" 
[16] "GRADEListCompRSPost"

C. Renaming the Column Names

colnames(short_2021) <- c("student", "gender", "grade", "tech_er", "attendance", "words_read", "pre_marsi_avg", "post_marsi_avg", "pre_tt_ge", "post_tt_ge", "pre_tt_nce", "post_tt_nce", "pre_tt_gsv", "post_tt_gsv", "listening_pre", "listening_post")
#short_2021$gender <- as.factor(short_2021$gender)
#short_2021$grade <- as.factor(short_2021$grade)
#short_2021$tech_er <- as.factor(short_2021$tech_er)

str(short_2021)
'data.frame':   20 obs. of  16 variables:
 $ student       : dbl+lbl [1:20]  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,...
   ..@ label        : chr "Last Name"
   ..@ format.spss  : chr "F8.2"
   ..@ display_width: int 11
   ..@ labels       : Named num  1 2 3 4 5 6 7 8 9 10 ...
   .. ..- attr(*, "names")= chr [1:20] "Bobbitt" "Boney" "Devora" "Holland" ...
 $ gender        : dbl+lbl [1:20] 2, 2, 2, 1, 2, 2, 2, 1, 1, 1, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2
   ..@ label        : chr "Gender"
   ..@ format.spss  : chr "F8.2"
   ..@ display_width: int 11
   ..@ labels       : Named num  1 2
   .. ..- attr(*, "names")= chr [1:2] "Male" "Female"
 $ grade         : dbl+lbl [1:20] 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 2
   ..@ label        : chr "Grade"
   ..@ format.spss  : chr "F8.2"
   ..@ display_width: int 15
   ..@ labels       : Named num  1 2 3
   .. ..- attr(*, "names")= chr [1:3] "Third Grade" "Fourth Grade" "Fifth Grade"
 $ tech_er       : dbl+lbl [1:20] 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 4, 4, 4, 3, 4, 3, 4, 4, 2
   ..@ label        : chr "Teacher"
   ..@ format.spss  : chr "F8.2"
   ..@ display_width: int 19
   ..@ labels       : Named num  1 2 3 4
   .. ..- attr(*, "names")= chr [1:4] "Maryanna Luna" "Ceara Oliver" "Jacie Gunter" "Christy Turner"
 $ attendance    : num  16 18 11 17 14 15 17 10 15 18 ...
  ..- attr(*, "label")= chr "Days Attended"
  ..- attr(*, "format.spss")= chr "F8.2"
  ..- attr(*, "display_width")= int 11
 $ words_read    : num  51858 28575 49406 38867 33160 ...
  ..- attr(*, "label")= chr "Words Read"
  ..- attr(*, "format.spss")= chr "F8.2"
  ..- attr(*, "display_width")= int 17
 $ pre_marsi_avg : num  2.73 2.6 3.2 3 3.8 3.8 3 2.6 3.86 2.73 ...
  ..- attr(*, "label")= chr "MARSI Composite Score Pretest"
  ..- attr(*, "format.spss")= chr "F8.2"
  ..- attr(*, "display_width")= int 15
 $ post_marsi_avg: num  3 4.2 NA 3.2 3.46 3.73 3.2 NA 5 3.6 ...
  ..- attr(*, "label")= chr "MARSI Composite Score Post"
  ..- attr(*, "format.spss")= chr "F8.2"
  ..- attr(*, "display_width")= int 19
 $ pre_tt_ge     : num  2.4 2.3 2 1.8 2.1 2.7 3.7 2.8 3.7 3.5 ...
  ..- attr(*, "label")= chr "GRADE Total Test Grade Equivalent Score Pre"
  ..- attr(*, "format.spss")= chr "F8.2"
 $ post_tt_ge    : num  2 3.1 NA 1.8 2.2 NA 3.8 NA 3.3 2.8 ...
  ..- attr(*, "label")= chr "GRADE Total Test Grade Equivalent Score Post"
  ..- attr(*, "format.spss")= chr "F8.2"
 $ pre_tt_nce    : num  39 36 28 21 32 44 37 36 37 36 ...
  ..- attr(*, "label")= chr "GRADE Total Test National Curve Equivalent Pre"
  ..- attr(*, "format.spss")= chr "F8.2"
  ..- attr(*, "display_width")= int 21
 $ post_tt_nce   : num  28 50 NA 19 35 NA 47 NA 43 36 ...
  ..- attr(*, "label")= chr "GRADE Total Test National Curve Equivalent Post"
  ..- attr(*, "format.spss")= chr "F8.2"
  ..- attr(*, "display_width")= int 21
 $ pre_tt_gsv    : num  406 401 385 375 394 415 434 417 434 430 ...
  ..- attr(*, "label")= chr "GRADE Total Test Growth Scale Value Pre"
  ..- attr(*, "format.spss")= chr "F8.2"
  ..- attr(*, "display_width")= int 22
 $ post_tt_gsv   : num  387 423 NA 372 398 NA 436 NA 428 418 ...
  ..- attr(*, "label")= chr "GRADE Total Test Growth Scale Value Post"
  ..- attr(*, "format.spss")= chr "F8.2"
  ..- attr(*, "display_width")= int 21
 $ listening_pre : num  15 13 15 12 12 16 16 14 17 15 ...
  ..- attr(*, "label")= chr "GRADE Listening Comprehension Raw Score Pre"
  ..- attr(*, "format.spss")= chr "F8.2"
  ..- attr(*, "display_width")= int 26
 $ listening_post: num  13 12 NA 11 11 12 12 NA 13 9 ...
  ..- attr(*, "label")= chr "GRADE Listening Comprehension Raw Score Post"
  ..- attr(*, "format.spss")= chr "F8.2"
  ..- attr(*, "display_width")= int 30

D. Checking the Existing Level and Labels in Some categorial Data Removing Them

# Loading Required Library
library(sjlabelled)
# Checking the Labels in the Entire Dataset
    #get_labels(short_2021[ ,1:4])
# Getting Rid of the Labels
short_2021$student <- remove_labels(short_2021$student, labels = c(1:20)) # There were 20 lebels, i.e., names of the students and getting rid of all of them
# Checking back
get_labels(short_2021$student)
NULL

E. Checking the Labels of Other Categorical Variables

get_labels(short_2021[ ,2:4])
$gender
[1] "Male"   "Female"

$grade
[1] "Third Grade"  "Fourth Grade" "Fifth Grade" 

$tech_er
[1] "Maryanna Luna"  "Ceara Oliver"   "Jacie Gunter"   "Christy Turner"

E. Replacing the Old Labels by Something New

short_2021$gender <- replace_labels(short_2021$gender, 
                                    labels = c(
                                      "Male" = "1",
                                      "Female" = "2"
                                    ))
short_2021$grade <- replace_labels(short_2021$grade, 
                                    labels = c(
                                      "Second Grade" = "1",
                                      "Third Grade" = "2",
                                      "Fourth Grade" = "3"
                                    ))
short_2021$tech_er <- replace_labels(short_2021$tech_er, 
                                    labels = c(
                                      "LM" = "1",
                                      "OC" = "2",
                                      "TC" = "3",
                                      "GJ" = "4"
                                    ))
get_labels(short_2021[ ,2:4])
$gender
[1] "Male"   "Female"

$grade
[1] "Second Grade" "Third Grade"  "Fourth Grade"

$tech_er
[1] "LM" "OC" "TC" "GJ"

F. Adding a New Column and Checking All Variables

short_2021$year <- c(0)
summary(short_2021)
    student          gender        grade        tech_er      attendance   
 Min.   : 1.00   Min.   :1.0   Min.   :1.0   Min.   :1.0   Min.   : 1.00  
 1st Qu.: 5.75   1st Qu.:1.0   1st Qu.:1.0   1st Qu.:1.0   1st Qu.:10.75  
 Median :10.50   Median :2.0   Median :2.0   Median :2.0   Median :15.50  
 Mean   :10.50   Mean   :1.7   Mean   :2.1   Mean   :2.4   Mean   :13.10  
 3rd Qu.:15.25   3rd Qu.:2.0   3rd Qu.:3.0   3rd Qu.:4.0   3rd Qu.:17.00  
 Max.   :20.00   Max.   :2.0   Max.   :3.0   Max.   :4.0   Max.   :18.00  
                                                                          
   words_read     pre_marsi_avg   post_marsi_avg    pre_tt_ge    
 Min.   : 16631   Min.   :2.600   Min.   :3.000   Min.   :1.800  
 1st Qu.: 24946   1st Qu.:2.865   1st Qu.:3.400   1st Qu.:2.450  
 Median : 33160   Median :3.730   Median :3.730   Median :2.900  
 Mean   : 40343   Mean   :3.429   Mean   :3.813   Mean   :3.261  
 3rd Qu.: 48778   3rd Qu.:3.860   3rd Qu.:4.200   3rd Qu.:3.700  
 Max.   :108885   Max.   :4.260   Max.   :5.000   Max.   :6.200  
 NA's   :5        NA's   :1       NA's   :7       NA's   :2      
   post_tt_ge      pre_tt_nce     post_tt_nce      pre_tt_gsv     post_tt_gsv 
 Min.   :1.800   Min.   :21.00   Min.   :19.00   Min.   :375.0   Min.   :372  
 1st Qu.:2.800   1st Qu.:28.00   1st Qu.:35.00   1st Qu.:407.5   1st Qu.:418  
 Median :3.300   Median :36.00   Median :37.00   Median :419.5   Median :428  
 Mean   :3.308   Mean   :35.94   Mean   :38.38   Mean   :421.1   Mean   :422  
 3rd Qu.:3.800   3rd Qu.:42.75   3rd Qu.:43.00   3rd Qu.:434.0   3rd Qu.:436  
 Max.   :6.400   Max.   :56.00   Max.   :57.00   Max.   :466.0   Max.   :467  
 NA's   :7       NA's   :2       NA's   :7       NA's   :2       NA's   :7    
 listening_pre   listening_post       year  
 Min.   : 8.00   Min.   : 9.00   Min.   :0  
 1st Qu.:13.00   1st Qu.:11.00   1st Qu.:0  
 Median :15.00   Median :12.00   Median :0  
 Mean   :14.05   Mean   :12.36   Mean   :0  
 3rd Qu.:15.50   3rd Qu.:13.00   3rd Qu.:0  
 Max.   :17.00   Max.   :16.00   Max.   :0  
 NA's   :1       NA's   :6                  

Thereโ€™s considerable number of missing data in all cases.

2. Getting 2022 Data

A. Loading Checking the Names of Variables

# Loading Data
data_2022 <- read_spss("reading_data_2022.sav")
data_2022 <- data.frame(data_2022)
# Checking the Variable Names
names(data_2022)
 [1] "student_name"               "student"                   
 [3] "year"                       "year_coded"                
 [5] "grade"                      "gender"                    
 [7] "teacher"                    "tech_er"                   
 [9] "words_read"                 "pre_marsi_grs"             
[11] "pre_marsi_pss"              "pre_marsi_srs"             
[13] "pre_marsi_avg"              "posttest_vocab_raw"        
[15] "posttest_vocab_stanine"     "posttest_vocab_ile"        
[17] "posttest_vocab_ge"          "posttest_vocab_ss"         
[19] "posttest_vocab_nce"         "posttest_cc_raw"           
[21] "posttest_cc_stanine"        "posttest_cc_ile"           
[23] "posttest_cc_ge"             "posttest_cc_ss"            
[25] "posttest_cc_nce"            "posttest_listening_comp"   
[27] "posttest_listening_stanine" "posttest_raw_total"        
[29] "posttest_tt_stanine"        "posttest_tt_ile"           
[31] "posttest_tt_ge"             "posttest_tt_ss"            
[33] "posttest_tt_nce"            "posttest_tt_gsv"           
[35] "post_marsi_grs"             "post_marsi_pss"            
[37] "post_marsi_srs"             "post_marsi_avg"            
[39] "pretest_vocab_raw"          "pretest_vocab_stanine"     
[41] "pretest_vocab_ile"          "pretest_vocab_ge"          
[43] "pretest_vocab_ss"           "pretest_vocab_nce"         
[45] "pretest_cc_raw"             "pretest_cc_stanine"        
[47] "pretest_cc_ile"             "pretest_cc_ge"             
[49] "pretest_cc_ss"              "pretest_cc_nce"            
[51] "pretest_listening_comp"     "pretest_listening_stanine" 
[53] "pretest_raw_total"          "pretest_tt_stanine"        
[55] "pretest_tt_ile"             "pretest_tt_ge"             
[57] "pretest_tt_ss"              "pretest_tt_nce"            
[59] "pretest_tt_gsv"            

B. Doing Everything Needed Like 2021 Data

# Selecting Required Columns
short_2022 <- select(data_2022, gender, student, grade, tech_er, words_read, pre_marsi_avg, pretest_tt_ge, posttest_tt_ge, pretest_tt_nce, posttest_tt_nce, pretest_tt_gsv, posttest_tt_gsv, pretest_listening_stanine, posttest_listening_stanine, year_coded)

# Renaming Columns to match with the columns in 2021 data
short_2022 <- short_2022|>
  rename(year = year_coded,
         listening_pre = pretest_listening_stanine,
         listening_post = posttest_listening_stanine,
         pre_tt_ge  = pretest_tt_ge,
         post_tt_ge  = posttest_tt_ge, 
         pre_tt_nce = pretest_tt_nce, 
         post_tt_nce = posttest_tt_nce, 
         pre_tt_gsv = pretest_tt_gsv, 
         post_tt_gsv = posttest_tt_gsv
  )

# Checking the new list of included variables
names(short_2022)
 [1] "gender"         "student"        "grade"          "tech_er"       
 [5] "words_read"     "pre_marsi_avg"  "pre_tt_ge"      "post_tt_ge"    
 [9] "pre_tt_nce"     "post_tt_nce"    "pre_tt_gsv"     "post_tt_gsv"   
[13] "listening_pre"  "listening_post" "year"          
# Changing the class of some of the variables 
short_2022$grade <- as.factor(short_2022$grade)
#short_2022$gender <- as.factor(short_2022$gender)
#short_2022$tech_er <- as.factor(short_2022$tech_er)

# cheking for the labels of the selected variables
get_labels(short_2022[, 3:5])
$grade
[1] "1" "2" "3" "4"

$tech_er
[1] "."  "MC" "WM" "TC" "FJ"

$words_read
NULL
# Having a look at the data
  #short_2022

# Changing the labels by new names
short_2022$grade <- replace_labels(short_2022$grade, 
                                    labels = c(
                                      "Second Grade" = "1",
                                      "Third Grade" = "2",
                                      "Fourth Grade" = "3",
                                      "Fifth Grade" = "4"
                                    ))

#Changing a dot into an NA in gender and tech_er variables
is.na(short_2022$gender) <- short_2022$gender== "."
is.na(short_2022$tech_er) <- short_2022$tech_er == "."

# Checking if that worked
get_labels(short_2022[, 3:5])
$grade
[1] "Second Grade" "Third Grade"  "Fourth Grade" "Fifth Grade" 

$tech_er
[1] NA   "MC" "WM" "TC" "FJ"

$words_read
NULL

C. Getting 2021 & 2022 data ready to merge

# Comparing variable names
library(waldo)
print(compare(names(short_2021), names(short_2022)), n = 25)
    old              | new                           
[1] "student"        -                               
[2] "gender"         | "gender"        [1]           
                     - "student"       [2]           
[3] "grade"          | "grade"         [3]           
[4] "tech_er"        | "tech_er"       [4]           
[5] "attendance"     -                               
[6] "words_read"     | "words_read"    [5]           
[7] "pre_marsi_avg"  | "pre_marsi_avg" [6]           
[8] "post_marsi_avg" -                               
[9] "pre_tt_ge"      | "pre_tt_ge"     [7]           
... ...                ...             and 2 more ...

D. Merging 2021 and 2022 dataset using rbind.fill() function from the {plyr} package

library(plyr)
merged_data <- rbind.fill(short_2021, short_2022)
summary(merged_data)
    student         gender              grade         tech_er         
 Min.   : 1.00   Length:39          Min.   :1.000   Length:39         
 1st Qu.: 5.50   Class :character   1st Qu.:1.000   Class :character  
 Median :10.00   Mode  :character   Median :2.000   Mode  :character  
 Mean   :10.26                      Mean   :2.308                     
 3rd Qu.:15.00                      3rd Qu.:3.000                     
 Max.   :20.00                      Max.   :4.000                     
                                                                      
   attendance      words_read     pre_marsi_avg   post_marsi_avg 
 Min.   : 1.00   Min.   :  3165   Min.   :1.530   Min.   :3.000  
 1st Qu.:10.75   1st Qu.: 19620   1st Qu.:2.783   1st Qu.:3.400  
 Median :15.50   Median : 30453   Median :3.365   Median :3.730  
 Mean   :13.10   Mean   : 35590   Mean   :3.380   Mean   :3.813  
 3rd Qu.:17.00   3rd Qu.: 48778   3rd Qu.:3.877   3rd Qu.:4.200  
 Max.   :18.00   Max.   :108885   Max.   :4.870   Max.   :5.000  
 NA's   :19      NA's   :8        NA's   :3       NA's   :26     
   pre_tt_ge       post_tt_ge      pre_tt_nce     post_tt_nce   
 Min.   :0.800   Min.   :0.900   Min.   :15.00   Min.   :19.00  
 1st Qu.:2.100   1st Qu.:2.200   1st Qu.:30.50   1st Qu.:37.00  
 Median :2.750   Median :3.100   Median :36.50   Median :46.00  
 Mean   :3.126   Mean   :3.183   Mean   :40.91   Mean   :48.28  
 3rd Qu.:3.700   3rd Qu.:3.900   3rd Qu.:49.00   3rd Qu.:53.00  
 Max.   :6.700   Max.   :6.400   Max.   :88.00   Max.   :84.00  
 NA's   :5       NA's   :10      NA's   :5       NA's   :10     
   pre_tt_gsv     post_tt_gsv    listening_pre  listening_post      year       
 Min.   :314.0   Min.   :319.0   Min.   : 2.0   Min.   : 1.0   Min.   :0.0000  
 1st Qu.:394.5   1st Qu.:399.0   1st Qu.: 4.5   1st Qu.: 5.0   1st Qu.:0.0000  
 Median :416.0   Median :423.0   Median :12.0   Median : 9.0   Median :0.0000  
 Mean   :414.7   Mean   :417.2   Mean   : 9.8   Mean   : 8.6   Mean   :0.4872  
 3rd Qu.:434.0   3rd Qu.:438.0   3rd Qu.:15.0   3rd Qu.:12.0   3rd Qu.:1.0000  
 Max.   :470.0   Max.   :467.0   Max.   :17.0   Max.   :16.0   Max.   :1.0000  
 NA's   :5       NA's   :10      NA's   :4      NA's   :9                      
#write.csv(merged_data, "overton_merged_data.csv")