knitr::opts_chunk$set(echo = TRUE)

library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Meaningful question for analysis: Do students do better in Math or Reading?
# This data set contains Test score information for counties in California. My conclusion is that the average test score is 650 across both Reading and Math. Students do equally well in both subjects.

# Read the data file CASchools.csv from GitHub
CalTestScoreData = read.table(file="https://raw.githubusercontent.com/BeshkiaKvarnstrom/MSDS-Repos/main/CASchools.csv", header=TRUE,sep=",")
# Question 1. Data Exploration: This should include summary statistics, means, medians, quartiles, or any
# other relevant information about the data set. Please include some conclusions in the R Markdown text.

summary(CalTestScoreData)
##        X            district        school             county         
##  Min.   :  1.0   Min.   :61382   Length:420         Length:420        
##  1st Qu.:105.8   1st Qu.:64308   Class :character   Class :character  
##  Median :210.5   Median :67761   Mode  :character   Mode  :character  
##  Mean   :210.5   Mean   :67473                                        
##  3rd Qu.:315.2   3rd Qu.:70419                                        
##  Max.   :420.0   Max.   :75440                                        
##     grades             students          teachers          calworks     
##  Length:420         Min.   :   81.0   Min.   :   4.85   Min.   : 0.000  
##  Class :character   1st Qu.:  379.0   1st Qu.:  19.66   1st Qu.: 4.395  
##  Mode  :character   Median :  950.5   Median :  48.56   Median :10.520  
##                     Mean   : 2628.8   Mean   : 129.07   Mean   :13.246  
##                     3rd Qu.: 3008.0   3rd Qu.: 146.35   3rd Qu.:18.981  
##                     Max.   :27176.0   Max.   :1429.00   Max.   :78.994  
##      lunch           computer       expenditure       income      
##  Min.   :  0.00   Min.   :   0.0   Min.   :3926   Min.   : 5.335  
##  1st Qu.: 23.28   1st Qu.:  46.0   1st Qu.:4906   1st Qu.:10.639  
##  Median : 41.75   Median : 117.5   Median :5215   Median :13.728  
##  Mean   : 44.71   Mean   : 303.4   Mean   :5312   Mean   :15.317  
##  3rd Qu.: 66.86   3rd Qu.: 375.2   3rd Qu.:5601   3rd Qu.:17.629  
##  Max.   :100.00   Max.   :3324.0   Max.   :7712   Max.   :55.328  
##     english            read            math      
##  Min.   : 0.000   Min.   :604.5   Min.   :605.4  
##  1st Qu.: 1.941   1st Qu.:640.4   1st Qu.:639.4  
##  Median : 8.778   Median :655.8   Median :652.5  
##  Mean   :15.768   Mean   :655.0   Mean   :653.3  
##  3rd Qu.:22.970   3rd Qu.:668.7   3rd Qu.:665.9  
##  Max.   :85.540   Max.   :704.0   Max.   :709.5
# Showing use of the unique()function on a dataframe to remove duplicates test scores by county
UniqueTestScores <- unique(data.frame(CalTestScoreData$county, CalTestScoreData$grades))
colnames(UniqueTestScores) <- c("COUNTY", "COUNTY GRADES")
UniqueTestScores
##              COUNTY COUNTY GRADES
## 1           Alameda         KK-08
## 2             Butte         KK-08
## 6            Fresno         KK-08
## 7       San Joaquin         KK-08
## 8              Kern         KK-08
## 10       Sacramento         KK-06
## 11           Merced         KK-08
## 13           Tulare         KK-08
## 19      Los Angeles         KK-08
## 21         Imperial         KK-08
## 27         Monterey         KK-06
## 31        San Diego         KK-08
## 32         Monterey         KK-08
## 35   San Bernardino         KK-08
## 39        San Mateo         KK-08
## 41          Ventura         KK-08
## 45        Riverside         KK-06
## 47      Santa Clara         KK-08
## 52           Madera         KK-08
## 54    Santa Barbara         KK-08
## 63           Orange         KK-06
## 64            Kings         KK-08
## 69        San Diego         KK-06
## 70           Sonoma         KK-06
## 77     Contra Costa         KK-08
## 78         Humboldt         KK-08
## 79         Siskiyou         KK-08
## 91       Sacramento         KK-08
## 92             Lake         KK-08
## 103          Sutter         KK-08
## 104       Mendocino         KK-08
## 106      San Benito         KK-08
## 119          Shasta         KK-08
## 121          Tehama         KK-08
## 126       Riverside         KK-08
## 133      Stanislaus         KK-08
## 134        Tuolumne         KK-08
## 136       El Dorado         KK-08
## 156          Orange         KK-08
## 169  San Bernardino         KK-06
## 176        Humboldt         KK-06
## 179          Placer         KK-08
## 180          Sonoma         KK-08
## 186           Glenn         KK-08
## 188          Lassen         KK-08
## 199      Santa Cruz         KK-08
## 209          Nevada         KK-08
## 218        Siskiyou         KK-06
## 233       Calaveras         KK-08
## 234           Marin         KK-08
## 248 San Luis Obispo         KK-06
## 252            Inyo         KK-08
## 276     Los Angeles         KK-06
## 291      Santa Cruz         KK-06
## 299   Santa Barbara         KK-06
## 354 San Luis Obispo         KK-08
## 367         Trinity         KK-08
## 378     Santa Clara         KK-06
## 419            Yuba         KK-08
# Quantile of CalTestScoreData - Finding and returning the percentiles of the math and reading scores
sapply(data.frame(CalTestScoreData$math, CalTestScoreData$read), function(x) round(quantile(x, probs = seq(0, 1, 1/4))))
##      CalTestScoreData.math CalTestScoreData.read
## 0%                     605                   604
## 25%                    639                   640
## 50%                    652                   656
## 75%                    666                   669
## 100%                   710                   704
# Display the mean and median Reading and Math scores.
MeanTestScores <- round(sapply(CalTestScoreData[, c("math", "read")], mean),2)
MedianTestScores <- round(sapply(CalTestScoreData[, c("math", "read")], median),2)
TestScoreStats <- data.frame(rbind(MeanTestScores, MedianTestScores))
colnames(TestScoreStats) <- c("MATHEMATICS", "READING")
print(TestScoreStats)
##                  MATHEMATICS READING
## MeanTestScores        653.34  654.97
## MedianTestScores      652.45  655.75
# Question 2. Data wrangling: Please perform some basic transformations. They will need to make sense but
# could include column renaming, creating a subset of the data, replacing values, or creating new
# columns with derived data (for example – if it makes sense you could sum two columns together)

# creating a subset of county with math grades > 650 and finding the minimum and maximum math grade for each county
TestScore_sub <- subset(CalTestScoreData, CalTestScoreData$math > 650)
TestScore_sub <- TestScore_sub[, c("county", "school",  "grades", "math", "read")]

Agg_TestScores <- TestScore_sub %>% group_by(school) %>% group_by(county, grades) %>%
            summarise(
                      Min_MathGrade = min(math), 
                      max_MathGrade = max(math),
                      Min_ReadGrade = min(read), 
                      max_ReadGrade = max(read),
                     .groups = 'drop'
                    )
colnames(Agg_TestScores) <- c("COUNTY", "GRADE", "MIN MATH SCORE", "MAX MATH SCORE", "MIN READING SCORE", "MAX READING SCORE")
Agg_TestScores
## # A tibble: 54 × 6
##    COUNTY       GRADE `MIN MATH SCORE` `MAX MATH SCORE` MIN READING SC…¹ MAX R…²
##    <chr>        <chr>            <dbl>            <dbl>            <dbl>   <dbl>
##  1 Alameda      KK-08             690              690              692.    692.
##  2 Butte        KK-08             651.             662.             636.    660.
##  3 Calaveras    KK-08             650.             650.             663.    663.
##  4 Contra Costa KK-08             653.             697.             657.    699.
##  5 El Dorado    KK-08             656.             677.             662.    678.
##  6 Fresno       KK-08             660.             676.             650     669.
##  7 Glenn        KK-08             660.             662.             668.    672.
##  8 Humboldt     KK-06             663.             667.             660.    672.
##  9 Humboldt     KK-08             652.             673.             656.    683 
## 10 Imperial     KK-08             654.             656.             648.    653.
## # … with 44 more rows, and abbreviated variable names ¹​`MIN READING SCORE`,
## #   ²​`MAX READING SCORE`
# renaming columns
RenameScoreData <- CalTestScoreData
names(RenameScoreData)[names(RenameScoreData) == 'x'] <- 'Record Count'
names(RenameScoreData)[names(RenameScoreData) == 'math'] <- 'Mathematics'
names(RenameScoreData)[names(RenameScoreData) == 'read'] <- 'Reading'
names(RenameScoreData)[names(RenameScoreData) == 'teachers'] <- 'Educators'

colnames(RenameScoreData)
##  [1] "X"           "district"    "school"      "county"      "grades"     
##  [6] "students"    "Educators"   "calworks"    "lunch"       "computer"   
## [11] "expenditure" "income"      "english"     "Reading"     "Mathematics"
#Box plot to show Math Scores for eack grade
MathBxPlt <- CalTestScoreData

MathBxPlt %>% 
  ggplot(aes(x=math, y= grades))+
  geom_boxplot(na.rm = TRUE)+
  labs(title = "Math Score by Grade", x= "Math Score", y = "Grades")+
  theme_classic()

# Question 3. Graphics: Please make sure to display at least one scatter plot, box plot and histogram. 
# Don’t be limited to this. Please explore the many other options in R packages such as ggplot2.

# The histogram below displays the Math Score ranges and the total number of schools within each range
MathScoreHis <- hist(CalTestScoreData$math, main="Math Scores Across All Schools", ylab = "Total Schools", xlab="Math Score", ylim=c(0,200), col="purple", breaks = 5)

text(MathScoreHis$mids,MathScoreHis$counts ,labels=MathScoreHis$counts, adj=c(0.5, -0.5))

# Question 3. 
# The histogram below displays the Read Score ranges and the total number of schools within each range using ggplot2 

TestScoreHisgg <- CalTestScoreData
TestScoreHisgg %>% 
  ggplot(aes(read))+
  geom_histogram( na.rm = TRUE, bins= 6, fill = "purple", color = "purple", alpha = 0.4 )+
  labs(title = "Read Scrores across all schools", x= "Read Score", y = "Total Schools")+
  theme_classic() 

# Creating a histogram utilizing ggplot2 comparing Math scores across grades
CompHisgg <- CalTestScoreData
CompHisgg %>% 
  ggplot(aes(`math`))+
  geom_histogram(na.rm = TRUE, bins= 6, fill = "purple", color = "purple", alpha = 0.4 )+
  labs(title = "Math Test Score by Grade Description", x= "Math Score", y = "Score Count")+
  theme_classic() + facet_wrap(~grades)

#Scatter Plot to Math grades for each Grade

ScattPlt <- CalTestScoreData
ScatterPlot <- ggplot(ScattPlt,aes(x=grades, y=math, color = grades)) + geom_point(size=3)
ScatterPlot