To independently explore the provided data files, which include students’ performance on STAAR EOC for Algebra I and student demographic information. Merge the data sets, perform an exploratory analysis, and communicate my findings via PowerPoint slides. This assignment aims to evaluate my proficiency in data cleaning, analysis, and visualization techniques.
# Load necessary packages
library(readxl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.3
## Warning: package 'tibble' was built under R version 4.1.3
## Warning: package 'readr' was built under R version 4.1.3
## Warning: package 'forcats' was built under R version 4.1.3
## Warning: package 'lubridate' was built under R version 4.1.3
## -- Attaching core tidyverse packages ------------------------ tidyverse 2.0.0 --
## v forcats 1.0.0 v readr 2.1.4
## v ggplot2 3.5.0 v stringr 1.5.1
## v lubridate 1.9.2 v tibble 3.2.1
## v purrr 1.0.2 v tidyr 1.3.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## i Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidyr)
library(purrr)
library(ggplot2)
library(scales)
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(latexpdf)
##
## Attaching package: 'latexpdf'
##
## The following object is masked from 'package:GGally':
##
## wrap
## [1] 236 20
## tibble [236 x 20] (S3: tbl_df/tbl/data.frame)
## $ D_StudentID : num [1:236] 1415085 1578121 1424549 1549105 1557558 ...
## $ D_CampusName : chr [1:236] "Campus A" "Campus A" "Campus A" "Campus A" ...
## $ D_CampusID : num [1:236] 100 100 100 100 100 100 100 100 100 27 ...
## $ Sex_Code : chr [1:236] "M" "F" "F" "F" ...
## $ Date_of_Birth (MMDDYYYY) : num [1:236] 5022004 6022004 3192004 3062003 1162003 ...
## $ Ethnicity_Race_Category : chr [1:236] "H" "H" "H" "H" ...
## $ Economically_Disadvantaged_Code: num [1:236] 1 0 1 1 1 0 1 1 1 1 ...
## $ S_StudentID : num [1:236] 1415085 1578121 1424549 1549105 1557558 ...
## $ S_CampusName : chr [1:236] "Campus A" "Campus A" "Campus A" "Campus A" ...
## $ S_CampusID : num [1:236] 100 100 100 100 100 100 100 100 100 27 ...
## $ End_Of_Course_Code : chr [1:236] "A1" "A1" "A1" "A1" ...
## $ Score_Code : chr [1:236] "S" "S" "S" "S" ...
## $ Test_Version : chr [1:236] "A" "S" "S" "S" ...
## $ First_Time_Tester_Retester_info: chr [1:236] "F" "F" "F" "F" ...
## $ BLANK16 : logi [1:236] NA NA NA NA NA NA ...
## $ Raw_Score : num [1:236] 37 51 30 27 29 27 35 39 12 34 ...
## $ Scale_Score : num [1:236] 4156 5127 3888 3780 3851 ...
## $ Approach_Grade_Level : num [1:236] 1 1 1 1 1 1 1 1 0 1 ...
## $ Meets_Grade_Level : num [1:236] 1 1 0 0 0 0 1 1 0 1 ...
## $ Masters_Grade_Level : num [1:236] 0 1 0 0 0 0 0 0 0 0 ...
## D_StudentID D_CampusName D_CampusID Sex_Code
## Min. :1159459 Length:236 Min. : 6.00 Length:236
## 1st Qu.:1373464 Class :character 1st Qu.: 27.00 Class :character
## Median :1513117 Mode :character Median : 27.00 Mode :character
## Mean :1458300 Mean : 45.75
## 3rd Qu.:1557621 3rd Qu.: 27.00
## Max. :1629992 Max. :250.00
##
## Date_of_Birth (MMDDYYYY) Ethnicity_Race_Category
## Min. : 1012003 Length:236
## 1st Qu.: 3184504 Class :character
## Median : 7042004 Mode :character
## Mean : 6680012
## 3rd Qu.:10044503
## Max. :12302003
##
## Economically_Disadvantaged_Code S_StudentID S_CampusName
## Min. :0.0000 Min. :1159459 Length:236
## 1st Qu.:1.0000 1st Qu.:1373464 Class :character
## Median :1.0000 Median :1513117 Mode :character
## Mean :0.8814 Mean :1458300
## 3rd Qu.:1.0000 3rd Qu.:1557621
## Max. :1.0000 Max. :1629992
##
## S_CampusID End_Of_Course_Code Score_Code Test_Version
## Min. : 6.00 Length:236 Length:236 Length:236
## 1st Qu.: 27.00 Class :character Class :character Class :character
## Median : 27.00 Mode :character Mode :character Mode :character
## Mean : 45.75
## 3rd Qu.: 27.00
## Max. :250.00
##
## First_Time_Tester_Retester_info BLANK16 Raw_Score Scale_Score
## Length:236 Mode:logical Min. : 7.00 Min. :2886
## Class :character NA's:236 1st Qu.:19.00 1st Qu.:3500
## Mode :character Median :30.00 Median :3888
## Mean :29.69 Mean :3917
## 3rd Qu.:39.00 3rd Qu.:4242
## Max. :53.00 Max. :5640
## NA's :1 NA's :1
## Approach_Grade_Level Meets_Grade_Level Masters_Grade_Level
## Min. :0.0000 Min. :0.000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:0.0000
## Median :1.0000 Median :0.000 Median :0.0000
## Mean :0.7034 Mean :0.428 Mean :0.2161
## 3rd Qu.:1.0000 3rd Qu.:1.000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.000 Max. :1.0000
##
## [1] "D_StudentID" "D_CampusName"
## [3] "D_CampusID" "Sex_Code"
## [5] "Date_of_Birth (MMDDYYYY)" "Ethnicity_Race_Category"
## [7] "Economically_Disadvantaged_Code" "S_StudentID"
## [9] "S_CampusName" "S_CampusID"
## [11] "End_Of_Course_Code" "Score_Code"
## [13] "Test_Version" "First_Time_Tester_Retester_info"
## [15] "BLANK16" "Raw_Score"
## [17] "Scale_Score" "Approach_Grade_Level"
## [19] "Meets_Grade_Level" "Masters_Grade_Level"
## [1] 238
## D_StudentID D_CampusName
## 0 0
## D_CampusID Sex_Code
## 0 0
## Date_of_Birth (MMDDYYYY) Ethnicity_Race_Category
## 0 0
## Economically_Disadvantaged_Code S_StudentID
## 0 0
## S_CampusName S_CampusID
## 0 0
## End_Of_Course_Code Score_Code
## 0 0
## Test_Version First_Time_Tester_Retester_info
## 0 0
## BLANK16 Raw_Score
## 236 1
## Scale_Score Approach_Grade_Level
## 1 0
## Meets_Grade_Level Masters_Grade_Level
## 0 0
## [1] "BLANK16" "Raw_Score" "Scale_Score"
## [1] 236
## [1] 235
## D_StudentID D_CampusName
## 0 0
## D_CampusID Sex_Code
## 0 0
## Date_of_Birth (MMDDYYYY) Ethnicity_Race_Category
## 0 0
## Economically_Disadvantaged_Code S_StudentID
## 0 0
## S_CampusName S_CampusID
## 0 0
## End_Of_Course_Code Score_Code
## 0 0
## Test_Version First_Time_Tester_Retester_info
## 0 0
## BLANK16 Raw_Score
## 235 0
## Scale_Score Approach_Grade_Level
## 0 0
## Meets_Grade_Level Masters_Grade_Level
## 0 0
## [1] "BLANK16"
## [1] 16
## [1] "D_StudentID" "D_CampusName"
## [3] "D_CampusID" "Sex_Code"
## [5] "Date_of_Birth (MMDDYYYY)" "Ethnicity_Race_Category"
## [7] "Economically_Disadvantaged_Code" "End_Of_Course_Code"
## [9] "Score_Code" "Test_Version"
## [11] "First_Time_Tester_Retester_info" "Raw_Score"
## [13] "Scale_Score" "Approach_Grade_Level"
## [15] "Meets_Grade_Level" "Masters_Grade_Level"
## [1] "D_StudentID" "D_CampusName"
## [3] "D_CampusID" "Sex_Code"
## [5] "Birthdate" "Ethnicity_Race_Category"
## [7] "Economically_Disadvantaged_Code" "End_Of_Course_Code"
## [9] "Score_Code" "Test_Version"
## [11] "First_Time_Tester_Retester_info" "Raw_Score"
## [13] "Scale_Score" "Approach_Grade_Level"
## [15] "Meets_Grade_Level" "Masters_Grade_Level"
## [1] "2004-05-02" "2004-06-02" "2004-03-19" "2003-03-06" "2003-01-16"
## [6] "2003-10-29" "2004-03-04" "2004-05-25" "2003-08-10" "2003-08-22"
## [11] "2003-09-09" "2004-03-09" "2003-06-16" "2004-03-10" "2003-11-11"
## [16] "2004-03-04" "2004-07-22" "2004-08-04" "2004-05-11" "2003-10-17"
## [21] "2002-09-17" "2004-07-14" "2003-12-20" "2004-04-26" "2002-12-11"
## [26] "2001-05-17" "2004-08-27" "2003-02-04" "2004-03-22" "2004-07-22"
## [31] "2003-12-19" "2003-08-03" "2004-02-22" "2003-11-11" "2002-11-28"
## [36] "2003-04-25" "2002-11-11" "2003-09-17" "2004-04-20" "2004-06-25"
## [41] "2004-06-25" "2003-12-02" "2004-08-24" "2002-08-18" "2002-02-06"
## [46] "2004-06-21" "2001-12-11" "2004-08-20" "2003-09-26" "2004-07-19"
## [51] "2003-11-26" "2003-10-11" "2003-11-30" "2003-09-24" "2004-05-22"
## [56] "2004-02-13" "2004-05-02" "2003-01-07" "2004-08-13" "2004-07-20"
## [61] "2003-02-21" "2003-09-04" "2003-09-13" "2001-05-12" "2002-08-26"
## [66] "2003-01-08" "2004-01-30" "2003-08-12" "2004-06-27" "2002-03-13"
## [71] "2003-08-23" "2004-01-13" "2004-01-05" "2004-08-27" "2004-05-10"
## [76] "2004-08-03" "2004-07-15" "2004-01-21" "2004-01-29" "2002-11-01"
## [81] "2003-06-30" "2002-03-13" "2002-06-28" "2004-02-18" "2003-12-30"
## [86] "2003-10-09" "2004-06-02" "2001-01-23" "2003-11-04" "2004-06-16"
## [91] "2003-08-04" "2004-04-09" "2003-10-03" "2003-07-04" "2002-10-21"
## [96] "2002-10-09" "2002-10-11" "2002-09-25" "2004-05-04" "2003-11-07"
## [101] "2003-05-20" "2003-12-23" "2004-08-11" "2003-09-03" "2003-11-13"
## [106] "2003-11-10" "2003-02-11" "2003-07-16" "2003-09-04" "2003-02-09"
## [111] "2003-12-17" "2004-01-22" "2004-05-31" "2003-10-09" "2004-05-03"
## [116] "2004-04-15" "2004-03-03" "2004-06-03" "2003-12-12" "2004-05-13"
## [121] "2004-04-18" "2003-12-08" "2003-12-08" "2003-06-04" "2004-02-20"
## [126] "2003-12-01" "2003-09-03" "2004-02-06" "2003-10-03" "2004-04-13"
## [131] "2004-01-23" "2003-11-13" "2003-02-20" "2004-06-20" "2003-10-08"
## [136] "2003-07-20" "2004-02-17" "2003-04-12" "2003-12-13" "2003-12-13"
## [141] "2003-09-07" "2004-08-17" "2003-10-11" "2003-02-22" "2003-12-04"
## [146] "2003-01-01" "2004-01-27" "2004-02-09" "2004-06-26" "2004-02-10"
## [151] "2003-03-24" "2004-08-24" "2002-07-23" "2004-02-19" "2003-01-21"
## [156] "2004-07-21" "2004-03-21" "2004-07-13" "2004-04-14" "2004-01-30"
## [161] "2003-01-09" "2004-07-04" "2003-12-16" "2001-12-16" "2004-03-16"
## [166] "2003-04-24" "2004-02-20" "2001-12-30" "2004-03-25" "2002-01-21"
## [171] "2003-10-03" "2004-08-18" "2004-06-13" "2004-01-30" "2004-01-08"
## [176] "2002-09-27" "2003-11-21" "2004-01-28" "2003-09-04" "2003-08-09"
## [181] "2003-04-16" "2002-10-09" "2002-10-11" "2002-09-25" "2004-05-04"
## [186] "2003-11-07" "2003-05-20" "2003-12-23" "2004-08-11" "2003-09-03"
## [191] "2003-11-13" "2003-11-10" "2003-02-11" "2003-07-16" "2003-09-04"
## [196] "2003-02-09" "2003-12-17" "2004-01-22" "2004-05-31" "2003-10-09"
## [201] "2004-05-03" "2004-04-15" "2004-03-03" "2004-06-03" "2003-12-12"
## [206] "2004-05-13" "2004-04-18" "2003-12-08" "2003-12-08" "2003-06-04"
## [211] "2004-02-20" "2003-12-01" "2003-09-03" "2004-02-06" "2003-10-03"
## [216] "2004-04-13" "2004-01-23" "2003-11-13" "2003-02-20" "2004-06-20"
## [221] "2003-10-08" "2003-07-20" "2004-02-17" "2003-04-12" "2003-12-13"
## [226] "2003-12-13" "2003-09-07" "2004-08-17" "2003-10-11" "2003-02-22"
## [231] "2003-12-04" "2003-01-01" "2004-01-27" "2004-02-09" "2004-06-26"
## Economically_Disadvantaged_Code
## Economically_Disadvantaged_Code 1.000000000
## Approach_Grade_Level -0.009777657
## Meets_Grade_Level 0.077438958
## Masters_Grade_Level 0.002440700
## Scale_Score 0.023553779
## Raw_Score 0.024591255
## Approach_Grade_Level Meets_Grade_Level
## Economically_Disadvantaged_Code -0.009777657 0.07743896
## Approach_Grade_Level 1.000000000 0.56058331
## Meets_Grade_Level 0.560583310 1.00000000
## Masters_Grade_Level 0.342912427 0.61170645
## Scale_Score 0.730180396 0.80634135
## Raw_Score 0.792331412 0.84076600
## Masters_Grade_Level Scale_Score Raw_Score
## Economically_Disadvantaged_Code 0.0024407 0.02355378 0.02459125
## Approach_Grade_Level 0.3429124 0.73018040 0.79233141
## Meets_Grade_Level 0.6117065 0.80634135 0.84076600
## Masters_Grade_Level 1.0000000 0.76229364 0.72491225
## Scale_Score 0.7622936 1.00000000 0.98512316
## Raw_Score 0.7249123 0.98512316 1.00000000
Male and female students have similar median scale scores. However,
females show a slightly wider interquartile range, indicating greater
variability in their scores. Poor students tend to have lower median
scores compared to their non-disadvantaged peers, highlighting a
potential achievement gap
## [1] 49
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## i Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## [1] 51
Meets Grade Level by Ethnicity: Most students in each ethnic category
did not meet the grade level, with Hispanic students having the highest
number of students both meeting and not meeting the grade level. This is
reflective of the predominant demographic in the data set.
Masters Grade Level by Gender: A higher proportion of female students appear to master the grade level compared to male students, suggesting better performance at the highest achievement level.
## `summarise()` has grouped output by 'Sex_Code'. You can override using the
## `.groups` argument.
##
## B H I
## F 10 103 0
## M 10 111 1
Hispanic make up over 90% of students in all the campuses in the Houston ISD, with male Hispanic students accounting for about 47% of the total population
# Filter data for students who have "Yes" (1) in all three levels
students_all_yes <- Combined[Combined$Approach_Grade_Level == 1 &
Combined$Meets_Grade_Level == 1 &
Combined$Masters_Grade_Level == 1, ]
# Count the number of students from each campus
campus_counts <- table(students_all_yes$D_CampusName)
# Convert campus counts to data frame
campus_counts_df <- data.frame(Campus = names(campus_counts),
Students_with_All_3_Levels = as.numeric(campus_counts))
print(view(campus_counts_df))
## Campus Students_with_All_3_Levels
## 1 Campus A 1
## 2 Campus B 36
## 3 Campus C 8
## 4 Campus D 6
# Create bar plot
print(ggplot(campus_counts_df,
aes(x = reorder(Campus, -Students_with_All_3_Levels),
y = Students_with_All_3_Levels, fill = Campus)) +
geom_bar(stat = "identity") +
geom_text(aes(label = Students_with_All_3_Levels), vjust = -0.5, color = "black") +
labs(title = "Number of Students with 'Yes' in All Three Levels by Campus",
x = "Campus", y = "Number of Students") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)))
# Calculate average Raw_Score by campus
average_scores <- aggregate(Raw_Score ~ D_CampusName, data = Combined, FUN = mean)
average_scores <- average_scores %>%
mutate(Raw_Score = round(Raw_Score, 2)) %>%
select(D_CampusName, Raw_Score) # Keep only the rounded numeric column and the categorical column
# View the modified data frame
print(view(average_scores))
## D_CampusName Raw_Score
## 1 Campus A 31.89
## 2 Campus B 29.65
## 3 Campus C 31.03
## 4 Campus D 26.75
# Create bar plot
# Define custom colors for each campus
campus_colors <- c("Campus A" = "blue", "Campus B" = "red", "Campus C" = "green", "Campus D" = "purple")
# Bar plot with distinct coloring for different campuses
print(ggplot(average_scores,
aes(x = reorder(D_CampusName, -Raw_Score),
y = Raw_Score, fill = D_CampusName)) +
geom_bar(stat = "identity") +
geom_text(aes(label = round(Raw_Score)), vjust = -0.5, color = "black") +
scale_fill_manual(values = campus_colors) +
labs(title = "Average Raw Score by Campus",
x = "Campus", y = "Average Raw Score") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)))