library(plyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
#Read in files
arsenic <- read.csv("arsenic.csv", header = TRUE, stringsAsFactors = FALSE)
flouride <- read.csv("flouride.csv", header = TRUE, stringsAsFactors = FALSE)
#Rename Columns
names(arsenic) <- c("City", "Wells_Tested", "Percent_Above", "Median", "Percentile_95", "Maximum")
names(flouride) <- c("City", "Wells_Tested", "Percent_Above", "Median", "Percentile_95", "Maximum")
#Add in Contaminant column to display either arsenic or flouride
arsenic$Contaminant <- c("Arsenic")
flouride$Contaminant <- c("Flouride")
#Round all numbers to 1 decimal place in each data frame
#Arsenic
arsenic <- mutate(arsenic, Median = round(Median, digits = 1), Percentile_95 = round(Percentile_95, digits = 1), Maximum = round(Maximum, digits = 1))
#Flouride
flouride <- mutate(flouride, Median = round(Median, digits = 1), Percentile_95 = round(Percentile_95, digits = 1),Maximum = round(Maximum, digits = 1))
#Combine Data to create a single table
combined <- rbind(arsenic, flouride)
#Remove data where there are fewer than 20 wells tested
#aresenic2 and flouride2 are created to use in a future join
arsenic2 <- arsenic %>% filter(Wells_Tested >= 20 )
flouride2 <- flouride %>% filter(Wells_Tested >= 20 )
combined <- combined %>% filter(Wells_Tested >= 20 )
#My town's data
#Add in a column, Wells_Over, to display the number of wells that are above the guideline along with the %
combined <- mutate(combined,Wells_Over = round((Wells_Tested * (Percent_Above/100)), digits = 1))
output1 <- combined %>% filter(City == "Brunswick") %>% select(City, Contaminant, Wells_Tested, Wells_Over, Percent_Above, Maximum, Median)
My Town
I was curious as to what data looked like for the town that I live in. Having only 7.5% and 5.0% of wells test high for arsenic and flouride, respectively, it appears as though the water in my area is less contaminated that most other cities in Maine.
library(knitr)
kable(output1)
Brunswick |
Arsenic |
255 |
19.1 |
7.5 |
250.0 |
0.5 |
Brunswick |
Flouride |
299 |
15.0 |
5.0 |
5.9 |
0.1 |
Maximum Vs. % of Wells Above Guideline
Next, I wanted to see if there appeared to be any correlation between the maximum particle count and the percent of wells that tested high in that town. Is it possible that if there is a higher concentration of arsenic/flouride in one well then the rest of the water in the area has a high chance of being affected?
Neither plot shows a definite correlation, however you can see more of a pattern with flouride (graph 2).
Arsenic
#Plot maximum vs % above guideline to see if more wells test high when the maximum is higher
library(ggvis)
#plot arsenic
plot1 <- ggvis(arsenic2, x = ~Maximum, y = ~Percent_Above)
layer_points(plot1)
Flouride
#plot flouride
plot2 <- ggvis(flouride2, x = ~Maximum, y = ~Percent_Above)
layer_points(plot2)
#Create a data frame that contains all wells above the guideline
highcities <- combined %>% filter(Percent_Above > 0) %>% arrange(desc(Percent_Above))
#Show top 10 highest % above, note flouride is not in the top 10
output2 <- highcities %>% select(City, Contaminant, Percent_Above) %>% arrange(desc(Percent_Above)) %>% top_n(10)
## Selecting by Percent_Above
#Checks to see what flouride's top 10 above the guideline is
output3 <- highcities %>% filter(Contaminant == "Flouride") %>% select(City, Contaminant, Percent_Above) %>% arrange(desc(Percent_Above)) %>% top_n(10)
## Selecting by Percent_Above
Towns with the Highest % of Wells Above the Guideline
The code that I wrote for the first table below, was designed to display the towns with the highest percent of wells above the guideline regardless of contaminant. It just so happened that arsenic caused more wells to test high than flouride. Since arsenic dominated the top 10 in the first chart, I went ahead and displayed the top 10 towns that tested high for flouride. None of the towns overlap between the two.
Top 10 Highest (all Arsenic)
kable(output2)
Manchester |
Arsenic |
58.9 |
Gorham |
Arsenic |
50.1 |
Columbia |
Arsenic |
50.0 |
Monmouth |
Arsenic |
49.5 |
Eliot |
Arsenic |
49.3 |
Columbia Falls |
Arsenic |
48.0 |
Winthrop |
Arsenic |
44.8 |
Hallowell |
Arsenic |
44.6 |
Buxton |
Arsenic |
43.4 |
Blue Hill |
Arsenic |
42.7 |
Top 10 Highest for Flouride
kable(output3)
Otis |
Flouride |
30.0 |
Dedham |
Flouride |
22.5 |
Denmark |
Flouride |
19.6 |
Surry |
Flouride |
18.3 |
Prospect |
Flouride |
17.5 |
Eastbrook |
Flouride |
16.1 |
Mercer |
Flouride |
15.6 |
Fryeburg |
Flouride |
15.4 |
Brownfield |
Flouride |
15.2 |
Stockton Springs |
Flouride |
14.3 |
#Create join between arsenic2 and flouride2
joined <- inner_join(arsenic2, flouride2, by = c("City" = "City"))
#Insert new calculated column that finds the average % above the guideline between both arsenic and flouride for each city
joined <- mutate(joined, Avg_Percent_Above = (Percent_Above.x + Percent_Above.y)/2)
#Output the top 10 cities with the highest average % above
output4 <- joined %>% select(City, Avg_Percent_Above) %>% arrange(desc(Avg_Percent_Above)) %>% top_n(10)
## Selecting by Avg_Percent_Above
Top 10 Towns with Highest Average %
Finally, I joined my two tables, arsenic and flouride, and added a caluclated column that would display the average percent of wells that tested high for each town. I wanted to see what towns had the highest average.
kable(output4)
Otis |
34.80 |
Manchester |
31.10 |
Surry |
29.30 |
Monmouth |
26.30 |
Blue Hill |
26.15 |
Mercer |
26.00 |
Columbia |
25.95 |
Gorham |
25.05 |
Orland |
24.65 |
Eliot |
24.65 |