library(plyr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)

#Read in files
arsenic <- read.csv("arsenic.csv", header = TRUE, stringsAsFactors = FALSE)
flouride <- read.csv("flouride.csv", header = TRUE, stringsAsFactors = FALSE)

#Rename Columns
names(arsenic) <- c("City", "Wells_Tested", "Percent_Above", "Median", "Percentile_95", "Maximum")
names(flouride) <- c("City", "Wells_Tested", "Percent_Above", "Median", "Percentile_95", "Maximum")

#Add in Contaminant column to display either arsenic or flouride
arsenic$Contaminant <- c("Arsenic")
flouride$Contaminant <- c("Flouride")

#Round all numbers to 1 decimal place in each data frame

#Arsenic
arsenic <- mutate(arsenic, Median = round(Median, digits = 1), Percentile_95 = round(Percentile_95, digits = 1), Maximum = round(Maximum, digits = 1))

#Flouride
flouride <- mutate(flouride, Median = round(Median, digits = 1), Percentile_95 = round(Percentile_95, digits = 1),Maximum = round(Maximum, digits = 1))

#Combine Data to create a single table
combined <- rbind(arsenic, flouride)

#Remove data where there are fewer than 20 wells tested
#aresenic2 and flouride2 are created to use in a future join
arsenic2 <- arsenic %>% filter(Wells_Tested >= 20 )
flouride2 <- flouride %>% filter(Wells_Tested >= 20 )

combined <- combined %>% filter(Wells_Tested >= 20 )

#My town's data
#Add in a column, Wells_Over, to display the number of wells that are above the guideline along with the %
combined <- mutate(combined,Wells_Over = round((Wells_Tested * (Percent_Above/100)), digits = 1))

output1 <- combined %>% filter(City == "Brunswick") %>% select(City, Contaminant, Wells_Tested, Wells_Over, Percent_Above, Maximum, Median)

My Town

I was curious as to what data looked like for the town that I live in. Having only 7.5% and 5.0% of wells test high for arsenic and flouride, respectively, it appears as though the water in my area is less contaminated that most other cities in Maine.

library(knitr)
kable(output1)

City	Contaminant	Wells_Tested	Wells_Over	Percent_Above	Maximum	Median
Brunswick	Arsenic	255	19.1	7.5	250.0	0.5
Brunswick	Flouride	299	15.0	5.0	5.9	0.1

Maximum Vs. % of Wells Above Guideline

Next, I wanted to see if there appeared to be any correlation between the maximum particle count and the percent of wells that tested high in that town. Is it possible that if there is a higher concentration of arsenic/flouride in one well then the rest of the water in the area has a high chance of being affected?

Neither plot shows a definite correlation, however you can see more of a pattern with flouride (graph 2).

Arsenic

#Plot maximum vs % above guideline to see if more wells test high when the maximum is higher

library(ggvis)
#plot arsenic
plot1 <-  ggvis(arsenic2, x = ~Maximum, y = ~Percent_Above)
layer_points(plot1)

Flouride

#plot flouride
plot2 <- ggvis(flouride2, x = ~Maximum, y = ~Percent_Above) 
layer_points(plot2)

#Create a data frame that contains all wells above the guideline
highcities <- combined %>% filter(Percent_Above > 0)  %>% arrange(desc(Percent_Above))

#Show top 10 highest % above, note flouride is not in the top 10
output2 <- highcities %>%  select(City, Contaminant, Percent_Above) %>% arrange(desc(Percent_Above)) %>% top_n(10)

## Selecting by Percent_Above

#Checks to see what flouride's top 10 above the guideline is
output3 <- highcities %>% filter(Contaminant == "Flouride") %>% select(City, Contaminant, Percent_Above) %>% arrange(desc(Percent_Above)) %>% top_n(10)

## Selecting by Percent_Above

Towns with the Highest % of Wells Above the Guideline

The code that I wrote for the first table below, was designed to display the towns with the highest percent of wells above the guideline regardless of contaminant. It just so happened that arsenic caused more wells to test high than flouride. Since arsenic dominated the top 10 in the first chart, I went ahead and displayed the top 10 towns that tested high for flouride. None of the towns overlap between the two.

Top 10 Highest (all Arsenic)

kable(output2)

City	Contaminant	Percent_Above
Manchester	Arsenic	58.9
Gorham	Arsenic	50.1
Columbia	Arsenic	50.0
Monmouth	Arsenic	49.5
Eliot	Arsenic	49.3
Columbia Falls	Arsenic	48.0
Winthrop	Arsenic	44.8
Hallowell	Arsenic	44.6
Buxton	Arsenic	43.4
Blue Hill	Arsenic	42.7

Top 10 Highest for Flouride

kable(output3)

City	Contaminant	Percent_Above
Otis	Flouride	30.0
Dedham	Flouride	22.5
Denmark	Flouride	19.6
Surry	Flouride	18.3
Prospect	Flouride	17.5
Eastbrook	Flouride	16.1
Mercer	Flouride	15.6
Fryeburg	Flouride	15.4
Brownfield	Flouride	15.2
Stockton Springs	Flouride	14.3

#Create join between arsenic2 and flouride2
joined <- inner_join(arsenic2, flouride2, by = c("City" = "City"))

#Insert new calculated column that finds the average % above the guideline between both arsenic and flouride for each city
joined <- mutate(joined, Avg_Percent_Above = (Percent_Above.x + Percent_Above.y)/2)

#Output the top 10 cities with the highest average % above
output4 <- joined %>% select(City, Avg_Percent_Above) %>% arrange(desc(Avg_Percent_Above)) %>% top_n(10)

## Selecting by Avg_Percent_Above

Top 10 Towns with Highest Average %

Finally, I joined my two tables, arsenic and flouride, and added a caluclated column that would display the average percent of wells that tested high for each town. I wanted to see what towns had the highest average.

kable(output4)

City	Avg_Percent_Above
Otis	34.80
Manchester	31.10
Surry	29.30
Monmouth	26.30
Blue Hill	26.15
Mercer	26.00
Columbia	25.95
Gorham	25.05
Orland	24.65
Eliot	24.65

Chase - Assignment 1

Ashley E Chase

September 24, 2016

My Town

I was curious as to what data looked like for the town that I live in. Having only 7.5% and 5.0% of wells test high for arsenic and flouride, respectively, it appears as though the water in my area is less contaminated that most other cities in Maine.

Maximum Vs. % of Wells Above Guideline

Neither plot shows a definite correlation, however you can see more of a pattern with flouride (graph 2).

Arsenic

Flouride

Towns with the Highest % of Wells Above the Guideline

Top 10 Highest (all Arsenic)

Top 10 Highest for Flouride

Top 10 Towns with Highest Average %

Finally, I joined my two tables, arsenic and flouride, and added a caluclated column that would display the average percent of wells that tested high for each town. I wanted to see what towns had the highest average.