The State of Maine Health and Environmental Testing Laboratory provided these data. The table was prepared by the Maine Environmental Public Health Tracking Program. The complete data set contains water test results from 46,855 private wells in Maine. Revision Date: 08/2015.
Both CSV files contain the following fields:
maximum - the maximum reading, in mg/L or ug/L. Prepare a report with an interesting narrative that focuses on a subset of the data you find interesting and that includes both arsenic and fluoride data. Your report should be uploaded to RPubs, and you should post a link to your RPubs report in Piazza. You are required to join the data. It is up to you to determine how to handle missing values.
I wrapped almost every call that displays the data in head() to reduce the length of the page.
library(plotly)
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(plyr)
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:plotly':
##
## arrange, mutate, rename, summarise
library(flexdashboard)
# Directory that holds the two well-water CSV files.
data_dir <- "E:/R-work"

# NOTE(review): setwd() in a script ties the report to one machine; the reads
# below use full paths, so consider dropping it.
setwd(data_dir)
getwd()
## [1] "E:/R-work"

# Load the fluoride and arsenic tables. `TRUE` is spelled out because `T` is an
# ordinary variable that can be reassigned.
flourideData <- read.csv(file.path(data_dir, "flouride.csv"), header = TRUE, sep = ",")
arsenicData <- read.csv(file.path(data_dir, "arsenic.csv"), header = TRUE, sep = ",")
# Preview the first six rows of the fluoride table.
utils::head(flourideData, n = 6L)
## location n_wells_tested percent_wells_above_guideline median
## 1 Otis 60 30.0 1.130
## 2 Dedham 102 22.5 0.940
## 3 Denmark 46 19.6 0.450
## 4 Surry 175 18.3 0.800
## 5 Prospect 57 17.5 0.785
## 6 Eastbrook 31 16.1 1.290
## percentile_95 maximum
## 1 3.200 3.6
## 2 3.270 7.0
## 3 3.150 3.9
## 4 3.525 6.9
## 5 2.500 2.7
## 6 2.445 3.3
# Preview the first six rows of the arsenic table.
utils::head(arsenicData, n = 6L)
## location n_wells_tested percent_wells_above_guideline median
## 1 Manchester 275 58.9 14.0
## 2 Gorham 467 50.1 10.5
## 3 Columbia 42 50.0 9.8
## 4 Monmouth 277 49.5 10.0
## 5 Eliot 73 49.3 9.7
## 6 Columbia Falls 25 48.0 8.1
## percentile_95 maximum
## 1 93.00 200
## 2 130.00 460
## 3 65.90 200
## 4 110.00 368
## 5 41.35 45
## 6 53.75 71
# Show the column types and leading values of the fluoride table.
utils::str(object = flourideData)
## 'data.frame': 917 obs. of 6 variables:
## $ location : Factor w/ 917 levels "Abbot","Acton",..: 451 165 167 585 493 190 381 231 90 577 ...
## $ n_wells_tested : int 60 102 46 175 57 31 32 52 33 56 ...
## $ percent_wells_above_guideline: num 30 22.5 19.6 18.3 17.5 16.1 15.6 15.4 15.2 14.3 ...
## $ median : num 1.13 0.94 0.45 0.8 0.785 1.29 0.6 0.76 0.265 0.6 ...
## $ percentile_95 : num 3.2 3.27 3.15 3.52 2.5 ...
## $ maximum : num 3.6 7 3.9 6.9 2.7 3.3 6.1 4.1 4.2 3.3 ...
# Show the column types and leading values of the arsenic table.
utils::str(object = arsenicData)
## 'data.frame': 917 obs. of 6 variables:
## $ location : Factor w/ 917 levels "Abbot","Acton",..: 363 239 135 395 198 136 909 255 97 66 ...
## $ n_wells_tested : int 275 467 42 277 73 25 424 65 334 241 ...
## $ percent_wells_above_guideline: num 58.9 50.1 50 49.5 49.3 48 44.8 44.6 43.4 42.7 ...
## $ median : num 14 10.5 9.8 10 9.7 8.1 8.2 8.6 6 7 ...
## $ percentile_95 : num 93 130 65.9 110 41.4 ...
## $ maximum : num 200 460 200 368 45 71 240 431 670 930 ...
# Per-column summary of the fluoride table, including NA counts.
base::summary(object = flourideData)
## location n_wells_tested percent_wells_above_guideline
## Abbot : 1 Min. : 0.00 Min. : 0.000
## Acton : 1 1st Qu.: 0.00 1st Qu.: 0.000
## Adamstown Twp: 1 Median : 6.00 Median : 0.600
## Addison : 1 Mean : 38.17 Mean : 2.448
## Albany Twp : 1 3rd Qu.: 49.00 3rd Qu.: 3.125
## Albion : 1 Max. :503.00 Max. :30.000
## (Other) :911 NA's :557
## median percentile_95 maximum
## Min. :0.1000 Min. :0.1000 Min. : 0.0500
## 1st Qu.:0.1000 1st Qu.:0.5195 1st Qu.: 0.4225
## Median :0.1000 Median :0.9855 Median : 1.3000
## Mean :0.1762 Mean :1.1471 Mean : 1.8987
## 3rd Qu.:0.2000 3rd Qu.:1.5995 3rd Qu.: 2.9000
## Max. :1.2900 Max. :4.4400 Max. :14.0000
## NA's :557 NA's :557 NA's :363
# Per-column summary of the arsenic table, including NA counts.
base::summary(object = arsenicData)
## location n_wells_tested percent_wells_above_guideline
## Abbot : 1 Min. : 0.00 Min. : 0.000
## Acton : 1 1st Qu.: 0.00 1st Qu.: 3.225
## Adamstown Twp: 1 Median : 5.00 Median : 8.300
## Addison : 1 Mean : 33.99 Mean :12.411
## Albany Twp : 1 3rd Qu.: 41.00 3rd Qu.:18.375
## Albion : 1 Max. :632.00 Max. :58.900
## (Other) :911 NA's :575
## median percentile_95 maximum
## Min. : 0.250 Min. : 0.500 Min. : 0.00
## 1st Qu.: 0.500 1st Qu.: 6.265 1st Qu.: 6.20
## Median : 1.000 Median : 13.650 Median : 24.00
## Mean : 1.617 Mean : 25.550 Mean : 67.35
## 3rd Qu.: 1.887 3rd Qu.: 28.350 3rd Qu.: 64.00
## Max. :14.000 Max. :372.500 Max. :3100.00
## NA's :575 NA's :575 NA's :364
library(ggvis)
##
## Attaching package: 'ggvis'
## The following objects are masked from 'package:plotly':
##
## add_data, hide_legend
## The following object is masked from 'package:ggplot2':
##
## resolution
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Scatter plots of wells tested against percent of wells above the guideline,
# first for fluoride and then for arsenic.
layer_points(
  ggvis(flourideData, ~n_wells_tested, ~percent_wells_above_guideline)
)
layer_points(
  ggvis(arsenicData, ~n_wells_tested, ~percent_wells_above_guideline)
)
# Re-read the fluoride table from disk (same file as the initial load), so the
# steps below start from an unmodified copy. `TRUE` replaces the reassignable
# shorthand `T`.
flourideData <- read.csv("E:/R-work/flouride.csv", header = TRUE, sep = ",")
library(knitr)
library(dplyr)
# Select all six columns, giving the first three more descriptive names, and
# preview the result.
flourideData %>%
  select(
    WellsLocation = location,
    NumbersWellsTested = n_wells_tested,
    PercWellsAboveGuideline = percent_wells_above_guideline,
    median,
    percentile_95,
    maximum
  ) %>%
  head()
## WellsLocation NumbersWellsTested PercWellsAboveGuideline median
## 1 Otis 60 30.0 1.130
## 2 Dedham 102 22.5 0.940
## 3 Denmark 46 19.6 0.450
## 4 Surry 175 18.3 0.800
## 5 Prospect 57 17.5 0.785
## 6 Eastbrook 31 16.1 1.290
## percentile_95 maximum
## 1 3.200 3.6
## 2 3.270 7.0
## 3 3.150 3.9
## 4 3.525 6.9
## 5 2.500 2.7
## 6 2.445 3.3
# Rename two columns, sort by median from highest to lowest, and preview.
flourideData %>%
  rename(
    Wells_Location = location,
    Numbers_Wells_Tested = n_wells_tested
  ) %>%
  arrange(desc(median)) %>%
  head()
## Wells_Location Numbers_Wells_Tested percent_wells_above_guideline median
## 1 Eastbrook 31 16.1 1.290
## 2 Otis 60 30.0 1.130
## 3 Marshfield 31 12.9 1.000
## 4 Dedham 102 22.5 0.940
## 5 Surry 175 18.3 0.800
## 6 Prospect 57 17.5 0.785
## percentile_95 maximum
## 1 2.445 3.3
## 2 3.200 3.6
## 3 3.570 4.4
## 4 3.270 7.0
## 5 3.525 6.9
## 6 2.500 2.7
# Reduce the table to two derived columns: a text label pasted together from
# location, wells tested and percent above guideline, and a flag that is TRUE
# when the median is at least 0.5.
flourideData %>%
  transmute(
    name = paste(location, n_wells_tested, percent_wells_above_guideline),
    dlist = median >= 0.5
  ) %>%
  head()
## name dlist
## 1 Otis 60 30 TRUE
## 2 Dedham 102 22.5 TRUE
## 3 Denmark 46 19.6 FALSE
## 4 Surry 175 18.3 TRUE
## 5 Prospect 57 17.5 TRUE
## 6 Eastbrook 31 16.1 TRUE
# Keep the fluoride row whose median ranks fifth when ordered high to low.
filter(flourideData, row_number(desc(median)) == 5)
## location n_wells_tested percent_wells_above_guideline median
## 1 Surry 175 18.3 0.8
## percentile_95 maximum
## 1 3.525 6.9
# Keep the arsenic row whose median ranks third when ordered high to low.
filter(arsenicData, row_number(desc(median)) == 3)
## location n_wells_tested percent_wells_above_guideline median
## 1 Monmouth 277 49.5 10
## percentile_95 maximum
## 1 110 368
# Sort by median from highest to lowest and take the sixth row.
slice(arrange(flourideData, desc(median)), 6)
## # A tibble: 1 x 6
## location n_wells_tested percent_wells_above_guideline median
## <fctr> <int> <dbl> <dbl>
## 1 Prospect 57 17.5 0.785
## # ... with 2 more variables: percentile_95 <dbl>, maximum <dbl>
# For the fluoride medians, add three ranking columns (percent rank,
# cumulative distribution, and quartile bucket) and preview the top rows.
flourideData %>%
  select(location, n_wells_tested, percent_wells_above_guideline, median) %>%
  arrange(desc(median)) %>%
  mutate(
    p_rank = percent_rank(median),
    cdist = cume_dist(median),
    ntile = ntile(median, 4)
  ) %>%
  head()
## location n_wells_tested percent_wells_above_guideline median p_rank
## 1 Eastbrook 31 16.1 1.290 1.0000000
## 2 Otis 60 30.0 1.130 0.9972145
## 3 Marshfield 31 12.9 1.000 0.9944290
## 4 Dedham 102 22.5 0.940 0.9916435
## 5 Surry 175 18.3 0.800 0.9888579
## 6 Prospect 57 17.5 0.785 0.9860724
## cdist ntile
## 1 1.0000000 4
## 2 0.9972222 4
## 3 0.9944444 4
## 4 0.9916667 4
## 5 0.9888889 4
## 6 0.9861111 4
# Sort arsenic medians from highest to lowest, then attach each row's
# neighbours: the previous row's median (lag) and the next row's median (lead).
arsenicData %>%
  arrange(desc(median)) %>%
  mutate(
    nxt_better = lag(median),
    nxt_worst = lead(median)
  ) %>%
  head()
## location n_wells_tested percent_wells_above_guideline median
## 1 Manchester 275 58.9 14.0
## 2 Gorham 467 50.1 10.5
## 3 Monmouth 277 49.5 10.0
## 4 Columbia 42 50.0 9.8
## 5 Eliot 73 49.3 9.7
## 6 Hallowell 65 44.6 8.6
## percentile_95 maximum nxt_better nxt_worst
## 1 93.00 200 NA 10.5
## 2 130.00 460 14.0 10.0
## 3 110.00 368 10.5 9.8
## 4 65.90 200 10.0 9.7
## 5 41.35 45 9.8 8.6
## 6 100.00 431 9.7 8.2
library(ggvis)
library(ggplot2)
# Wells tested against median arsenic: points plus a smoothed trend line.
ggplot(data = arsenicData, mapping = aes(x = median, y = n_wells_tested)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'loess'
## Warning: Removed 575 rows containing non-finite values (stat_smooth).
## Warning: Removed 575 rows containing missing values (geom_point).
# Same axes as the previous plot, showing only the smoothed trend line.
ggplot(data = arsenicData, mapping = aes(x = median, y = n_wells_tested)) +
  geom_smooth()
## `geom_smooth()` using method = 'loess'
## Warning: Removed 575 rows containing non-finite values (stat_smooth).
# Smoothed trend line with the colour aesthetic mapped to the percent of wells
# above the guideline.
ggplot(
  data = arsenicData,
  mapping = aes(
    x = median,
    y = n_wells_tested,
    col = percent_wells_above_guideline
  )
) +
  geom_smooth()
## `geom_smooth()` using method = 'loess'
## Warning: Removed 575 rows containing non-finite values (stat_smooth).
# Same colour mapping, drawn as semi-transparent points instead of a smooth.
ggplot(
  data = arsenicData,
  mapping = aes(
    x = median,
    y = n_wells_tested,
    col = percent_wells_above_guideline
  )
) +
  geom_point(alpha = 0.3)
## Warning: Removed 575 rows containing missing values (geom_point).
# Build the base plot once so that different point layers can be added to it.
# Assigned with `<-`, the conventional R assignment operator, instead of `=`.
ars_plot <- ggplot(arsenicData, aes(x = median, y = n_wells_tested))

# Plain scatter of wells tested against median arsenic.
ars_plot + geom_point()
## Warning: Removed 575 rows containing missing values (geom_point).

# Same scatter, with the colour aesthetic mapped inside the geom layer.
ars_plot + geom_point(aes(col = percent_wells_above_guideline))
## Warning: Removed 575 rows containing missing values (geom_point).
# Wells tested against the 95th percentile, with points and a smooth layer,
# for fluoride and then arsenic.
flourideData %>%
  ggvis(~n_wells_tested, ~percentile_95) %>%
  layer_points() %>%
  layer_smooths()
arsenicData %>%
  ggvis(~n_wells_tested, ~percentile_95) %>%
  layer_points() %>%
  layer_smooths()

# Wells tested against percent of wells above the guideline, same layers.
flourideData %>%
  ggvis(~n_wells_tested, ~percent_wells_above_guideline) %>%
  layer_points() %>%
  layer_smooths()
arsenicData %>%
  ggvis(~n_wells_tested, ~percent_wells_above_guideline) %>%
  layer_points() %>%
  layer_smooths()
# Bar layers of percent above guideline by number of wells tested.
layer_bars(
  ggvis(flourideData, ~n_wells_tested, ~percent_wells_above_guideline)
)
layer_bars(
  ggvis(arsenicData, ~n_wells_tested, ~percent_wells_above_guideline)
)
# Histogram of the number of wells tested per location (fluoride table).
layer_histograms(ggvis(flourideData, ~n_wells_tested))
## Guessing width = 20 # range / 26
# Histogram of the number of wells tested per location (arsenic table).
layer_histograms(ggvis(arsenicData, ~n_wells_tested))
## Guessing width = 20 # range / 32
# Density curves of the number of wells tested: fluoride filled green,
# arsenic filled red.
flourideData %>%
  ggvis(~n_wells_tested, fill := "green") %>%
  layer_densities()
arsenicData %>%
  ggvis(~n_wells_tested, fill := "red") %>%
  layer_densities()