library(plyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
#Read in files
arsenic <- read.csv("arsenic.csv", header = TRUE, stringsAsFactors = FALSE)
flouride <- read.csv("flouride.csv", header = TRUE, stringsAsFactors = FALSE)
#Rename Columns
names(arsenic) <- c("City", "Wells_Tested", "Percent_Above", "Median", "Percentile_95", "Maximum")
names(flouride) <- c("City", "Wells_Tested", "Percent_Above", "Median", "Percentile_95", "Maximum")

#Add in Contaminant column to display either arsenic or flouride
arsenic$Contaminant <- c("Arsenic")
flouride$Contaminant <- c("Flouride")
#Round all numbers to 1 decimal place in each data frame

#Arsenic
arsenic <- mutate(arsenic, Median = round(Median, digits = 1), Percentile_95 = round(Percentile_95, digits = 1), Maximum = round(Maximum, digits = 1))

#Flouride
flouride <- mutate(flouride, Median = round(Median, digits = 1), Percentile_95 = round(Percentile_95, digits = 1),Maximum = round(Maximum, digits = 1))
#Combine Data to create a single table
combined <- rbind(arsenic, flouride)
#Remove data where there are fewer than 20 wells tested
#aresenic2 and flouride2 are created to use in a future join
arsenic2 <- arsenic %>% filter(Wells_Tested >= 20 )
flouride2 <- flouride %>% filter(Wells_Tested >= 20 )

combined <- combined %>% filter(Wells_Tested >= 20 )
#My town's data
#Add in a column, Wells_Over, to display the number of wells that are above the guideline along with the %
combined <- mutate(combined,Wells_Over = round((Wells_Tested * (Percent_Above/100)), digits = 1))

output1 <- combined %>% filter(City == "Brunswick") %>% select(City, Contaminant, Wells_Tested, Wells_Over, Percent_Above, Maximum, Median)

My Town

I was curious as to what data looked like for the town that I live in. Having only 7.5% and 5.0% of wells test high for arsenic and flouride, respectively, it appears as though the water in my area is less contaminated that most other cities in Maine.
library(knitr)
kable(output1)
City Contaminant Wells_Tested Wells_Over Percent_Above Maximum Median
Brunswick Arsenic 255 19.1 7.5 250.0 0.5
Brunswick Flouride 299 15.0 5.0 5.9 0.1

Maximum Vs. % of Wells Above Guideline

Next, I wanted to see if there appeared to be any correlation between the maximum particle count and the percent of wells that tested high in that town. Is it possible that if there is a higher concentration of arsenic/flouride in one well then the rest of the water in the area has a high chance of being affected?
Neither plot shows a definite correlation, however you can see more of a pattern with flouride (graph 2).

Arsenic

#Plot maximum vs % above guideline to see if more wells test high when the maximum is higher

library(ggvis)
#plot arsenic
plot1 <-  ggvis(arsenic2, x = ~Maximum, y = ~Percent_Above)
layer_points(plot1)

Flouride

#plot flouride
plot2 <- ggvis(flouride2, x = ~Maximum, y = ~Percent_Above) 
layer_points(plot2)
#Create a data frame that contains all wells above the guideline
highcities <- combined %>% filter(Percent_Above > 0)  %>% arrange(desc(Percent_Above))

#Show top 10 highest % above, note flouride is not in the top 10
output2 <- highcities %>%  select(City, Contaminant, Percent_Above) %>% arrange(desc(Percent_Above)) %>% top_n(10)
## Selecting by Percent_Above
#Checks to see what flouride's top 10 above the guideline is
output3 <- highcities %>% filter(Contaminant == "Flouride") %>% select(City, Contaminant, Percent_Above) %>% arrange(desc(Percent_Above)) %>% top_n(10)
## Selecting by Percent_Above

Towns with the Highest % of Wells Above the Guideline

The code that I wrote for the first table below, was designed to display the towns with the highest percent of wells above the guideline regardless of contaminant. It just so happened that arsenic caused more wells to test high than flouride. Since arsenic dominated the top 10 in the first chart, I went ahead and displayed the top 10 towns that tested high for flouride. None of the towns overlap between the two.

Top 10 Highest (all Arsenic)

kable(output2)
City Contaminant Percent_Above
Manchester Arsenic 58.9
Gorham Arsenic 50.1
Columbia Arsenic 50.0
Monmouth Arsenic 49.5
Eliot Arsenic 49.3
Columbia Falls Arsenic 48.0
Winthrop Arsenic 44.8
Hallowell Arsenic 44.6
Buxton Arsenic 43.4
Blue Hill Arsenic 42.7

Top 10 Highest for Flouride

kable(output3)
City Contaminant Percent_Above
Otis Flouride 30.0
Dedham Flouride 22.5
Denmark Flouride 19.6
Surry Flouride 18.3
Prospect Flouride 17.5
Eastbrook Flouride 16.1
Mercer Flouride 15.6
Fryeburg Flouride 15.4
Brownfield Flouride 15.2
Stockton Springs Flouride 14.3
#Create join between arsenic2 and flouride2
joined <- inner_join(arsenic2, flouride2, by = c("City" = "City"))

#Insert new calculated column that finds the average % above the guideline between both arsenic and flouride for each city
joined <- mutate(joined, Avg_Percent_Above = (Percent_Above.x + Percent_Above.y)/2)

#Output the top 10 cities with the highest average % above
output4 <- joined %>% select(City, Avg_Percent_Above) %>% arrange(desc(Avg_Percent_Above)) %>% top_n(10)
## Selecting by Avg_Percent_Above

Top 10 Towns with Highest Average %

Finally, I joined my two tables, arsenic and flouride, and added a caluclated column that would display the average percent of wells that tested high for each town. I wanted to see what towns had the highest average.
kable(output4)
City Avg_Percent_Above
Otis 34.80
Manchester 31.10
Surry 29.30
Monmouth 26.30
Blue Hill 26.15
Mercer 26.00
Columbia 25.95
Gorham 25.05
Orland 24.65
Eliot 24.65