knitr::opts_chunk$set(echo = TRUE)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Meaningful question for analysis: Do students do better in Math or Reading?
# This data set contains Test score information for counties in California. My conclusion is that the average test score is 650 across both Reading and Math. Students do equally well in both subjects.
# Read the data file CASchools.csv from GitHub
CalTestScoreData = read.table(file="https://raw.githubusercontent.com/BeshkiaKvarnstrom/MSDS-Repos/main/CASchools.csv", header=TRUE,sep=",")
# Question 1. Data Exploration: This should include summary statistics, means, medians, quartiles, or any
# other relevant information about the data set. Please include some conclusions in the R Markdown text.
summary(CalTestScoreData)
## X district school county
## Min. : 1.0 Min. :61382 Length:420 Length:420
## 1st Qu.:105.8 1st Qu.:64308 Class :character Class :character
## Median :210.5 Median :67761 Mode :character Mode :character
## Mean :210.5 Mean :67473
## 3rd Qu.:315.2 3rd Qu.:70419
## Max. :420.0 Max. :75440
## grades students teachers calworks
## Length:420 Min. : 81.0 Min. : 4.85 Min. : 0.000
## Class :character 1st Qu.: 379.0 1st Qu.: 19.66 1st Qu.: 4.395
## Mode :character Median : 950.5 Median : 48.56 Median :10.520
## Mean : 2628.8 Mean : 129.07 Mean :13.246
## 3rd Qu.: 3008.0 3rd Qu.: 146.35 3rd Qu.:18.981
## Max. :27176.0 Max. :1429.00 Max. :78.994
## lunch computer expenditure income
## Min. : 0.00 Min. : 0.0 Min. :3926 Min. : 5.335
## 1st Qu.: 23.28 1st Qu.: 46.0 1st Qu.:4906 1st Qu.:10.639
## Median : 41.75 Median : 117.5 Median :5215 Median :13.728
## Mean : 44.71 Mean : 303.4 Mean :5312 Mean :15.317
## 3rd Qu.: 66.86 3rd Qu.: 375.2 3rd Qu.:5601 3rd Qu.:17.629
## Max. :100.00 Max. :3324.0 Max. :7712 Max. :55.328
## english read math
## Min. : 0.000 Min. :604.5 Min. :605.4
## 1st Qu.: 1.941 1st Qu.:640.4 1st Qu.:639.4
## Median : 8.778 Median :655.8 Median :652.5
## Mean :15.768 Mean :655.0 Mean :653.3
## 3rd Qu.:22.970 3rd Qu.:668.7 3rd Qu.:665.9
## Max. :85.540 Max. :704.0 Max. :709.5
# Showing use of the unique()function on a dataframe to remove duplicates test scores by county
UniqueTestScores <- unique(data.frame(CalTestScoreData$county, CalTestScoreData$grades))
colnames(UniqueTestScores) <- c("COUNTY", "COUNTY GRADES")
UniqueTestScores
## COUNTY COUNTY GRADES
## 1 Alameda KK-08
## 2 Butte KK-08
## 6 Fresno KK-08
## 7 San Joaquin KK-08
## 8 Kern KK-08
## 10 Sacramento KK-06
## 11 Merced KK-08
## 13 Tulare KK-08
## 19 Los Angeles KK-08
## 21 Imperial KK-08
## 27 Monterey KK-06
## 31 San Diego KK-08
## 32 Monterey KK-08
## 35 San Bernardino KK-08
## 39 San Mateo KK-08
## 41 Ventura KK-08
## 45 Riverside KK-06
## 47 Santa Clara KK-08
## 52 Madera KK-08
## 54 Santa Barbara KK-08
## 63 Orange KK-06
## 64 Kings KK-08
## 69 San Diego KK-06
## 70 Sonoma KK-06
## 77 Contra Costa KK-08
## 78 Humboldt KK-08
## 79 Siskiyou KK-08
## 91 Sacramento KK-08
## 92 Lake KK-08
## 103 Sutter KK-08
## 104 Mendocino KK-08
## 106 San Benito KK-08
## 119 Shasta KK-08
## 121 Tehama KK-08
## 126 Riverside KK-08
## 133 Stanislaus KK-08
## 134 Tuolumne KK-08
## 136 El Dorado KK-08
## 156 Orange KK-08
## 169 San Bernardino KK-06
## 176 Humboldt KK-06
## 179 Placer KK-08
## 180 Sonoma KK-08
## 186 Glenn KK-08
## 188 Lassen KK-08
## 199 Santa Cruz KK-08
## 209 Nevada KK-08
## 218 Siskiyou KK-06
## 233 Calaveras KK-08
## 234 Marin KK-08
## 248 San Luis Obispo KK-06
## 252 Inyo KK-08
## 276 Los Angeles KK-06
## 291 Santa Cruz KK-06
## 299 Santa Barbara KK-06
## 354 San Luis Obispo KK-08
## 367 Trinity KK-08
## 378 Santa Clara KK-06
## 419 Yuba KK-08
# Quantile of CalTestScoreData - Finding and returning the percentiles of the math and reading scores
sapply(data.frame(CalTestScoreData$math, CalTestScoreData$read), function(x) round(quantile(x, probs = seq(0, 1, 1/4))))
## CalTestScoreData.math CalTestScoreData.read
## 0% 605 604
## 25% 639 640
## 50% 652 656
## 75% 666 669
## 100% 710 704
# Display the mean and median Reading and Math scores.
MeanTestScores <- round(sapply(CalTestScoreData[, c("math", "read")], mean),2)
MedianTestScores <- round(sapply(CalTestScoreData[, c("math", "read")], median),2)
TestScoreStats <- data.frame(rbind(MeanTestScores, MedianTestScores))
colnames(TestScoreStats) <- c("MATHEMATICS", "READING")
print(TestScoreStats)
## MATHEMATICS READING
## MeanTestScores 653.34 654.97
## MedianTestScores 652.45 655.75
# Question 2. Data wrangling: Please perform some basic transformations. They will need to make sense but
# could include column renaming, creating a subset of the data, replacing values, or creating new
# columns with derived data (for example – if it makes sense you could sum two columns together)
# creating a subset of county with math grades > 650 and finding the minimum and maximum math grade for each county
TestScore_sub <- subset(CalTestScoreData, CalTestScoreData$math > 650)
TestScore_sub <- TestScore_sub[, c("county", "school", "grades", "math", "read")]
Agg_TestScores <- TestScore_sub %>% group_by(school) %>% group_by(county, grades) %>%
summarise(
Min_MathGrade = min(math),
max_MathGrade = max(math),
Min_ReadGrade = min(read),
max_ReadGrade = max(read),
.groups = 'drop'
)
colnames(Agg_TestScores) <- c("COUNTY", "GRADE", "MIN MATH SCORE", "MAX MATH SCORE", "MIN READING SCORE", "MAX READING SCORE")
Agg_TestScores
## # A tibble: 54 × 6
## COUNTY GRADE `MIN MATH SCORE` `MAX MATH SCORE` MIN READING SC…¹ MAX R…²
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Alameda KK-08 690 690 692. 692.
## 2 Butte KK-08 651. 662. 636. 660.
## 3 Calaveras KK-08 650. 650. 663. 663.
## 4 Contra Costa KK-08 653. 697. 657. 699.
## 5 El Dorado KK-08 656. 677. 662. 678.
## 6 Fresno KK-08 660. 676. 650 669.
## 7 Glenn KK-08 660. 662. 668. 672.
## 8 Humboldt KK-06 663. 667. 660. 672.
## 9 Humboldt KK-08 652. 673. 656. 683
## 10 Imperial KK-08 654. 656. 648. 653.
## # … with 44 more rows, and abbreviated variable names ¹​`MIN READING SCORE`,
## # ²​`MAX READING SCORE`
# renaming columns
RenameScoreData <- CalTestScoreData
names(RenameScoreData)[names(RenameScoreData) == 'x'] <- 'Record Count'
names(RenameScoreData)[names(RenameScoreData) == 'math'] <- 'Mathematics'
names(RenameScoreData)[names(RenameScoreData) == 'read'] <- 'Reading'
names(RenameScoreData)[names(RenameScoreData) == 'teachers'] <- 'Educators'
colnames(RenameScoreData)
## [1] "X" "district" "school" "county" "grades"
## [6] "students" "Educators" "calworks" "lunch" "computer"
## [11] "expenditure" "income" "english" "Reading" "Mathematics"
#Box plot to show Math Scores for eack grade
MathBxPlt <- CalTestScoreData
MathBxPlt %>%
ggplot(aes(x=math, y= grades))+
geom_boxplot(na.rm = TRUE)+
labs(title = "Math Score by Grade", x= "Math Score", y = "Grades")+
theme_classic()

# Question 3. Graphics: Please make sure to display at least one scatter plot, box plot and histogram.
# Don’t be limited to this. Please explore the many other options in R packages such as ggplot2.
# The histogram below displays the Math Score ranges and the total number of schools within each range
MathScoreHis <- hist(CalTestScoreData$math, main="Math Scores Across All Schools", ylab = "Total Schools", xlab="Math Score", ylim=c(0,200), col="purple", breaks = 5)
text(MathScoreHis$mids,MathScoreHis$counts ,labels=MathScoreHis$counts, adj=c(0.5, -0.5))

# Question 3.
# The histogram below displays the Read Score ranges and the total number of schools within each range using ggplot2
TestScoreHisgg <- CalTestScoreData
TestScoreHisgg %>%
ggplot(aes(read))+
geom_histogram( na.rm = TRUE, bins= 6, fill = "purple", color = "purple", alpha = 0.4 )+
labs(title = "Read Scrores across all schools", x= "Read Score", y = "Total Schools")+
theme_classic()

# Creating a histogram utilizing ggplot2 comparing Math scores across grades
CompHisgg <- CalTestScoreData
CompHisgg %>%
ggplot(aes(`math`))+
geom_histogram(na.rm = TRUE, bins= 6, fill = "purple", color = "purple", alpha = 0.4 )+
labs(title = "Math Test Score by Grade Description", x= "Math Score", y = "Score Count")+
theme_classic() + facet_wrap(~grades)

#Scatter Plot to Math grades for each Grade
ScattPlt <- CalTestScoreData
ScatterPlot <- ggplot(ScattPlt,aes(x=grades, y=math, color = grades)) + geom_point(size=3)
ScatterPlot
