Assignment 1

The State of Maine Health and Environmental Testing Laboratory provided these data. The table was prepared by the Maine Environmental Public Health Tracking Program. The complete data set contains water test results from 46,855 private wells in Maine. Revision Date: 08/2015.

Both CSV files contain the same six fields: location, n_wells_tested, percent_wells_above_guideline, median, percentile_95 and maximum.

Read the dataset

Use the function read.csv to read the data from our directory.
Use the function head to display the first six rows of each data set.
# Load the fluoride and arsenic well-test tables from the local work folder.
# `header = TRUE` is spelled out in full: `T` is an ordinary variable that can
# be reassigned, whereas `TRUE` is a reserved word.
flourideData <- read.csv("E:/R-work/flouride.csv", header = TRUE, sep = ",")
arsenicData <- read.csv("E:/R-work/arsenic.csv", header = TRUE, sep = ",")
# Preview the first six rows of the fluoride table.
head(flourideData)
##    location n_wells_tested percent_wells_above_guideline median
## 1      Otis             60                          30.0  1.130
## 2    Dedham            102                          22.5  0.940
## 3   Denmark             46                          19.6  0.450
## 4     Surry            175                          18.3  0.800
## 5  Prospect             57                          17.5  0.785
## 6 Eastbrook             31                          16.1  1.290
##   percentile_95 maximum
## 1         3.200     3.6
## 2         3.270     7.0
## 3         3.150     3.9
## 4         3.525     6.9
## 5         2.500     2.7
## 6         2.445     3.3
# Preview the first six rows of the arsenic table (six is head()'s default).
head(arsenicData, n = 6L)
##         location n_wells_tested percent_wells_above_guideline median
## 1     Manchester            275                          58.9   14.0
## 2         Gorham            467                          50.1   10.5
## 3       Columbia             42                          50.0    9.8
## 4       Monmouth            277                          49.5   10.0
## 5          Eliot             73                          49.3    9.7
## 6 Columbia Falls             25                          48.0    8.1
##   percentile_95 maximum
## 1         93.00     200
## 2        130.00     460
## 3         65.90     200
## 4        110.00     368
## 5         41.35      45
## 6         53.75      71

Use the function str to display the structure of the data.

# Compact overview of the fluoride table: dimensions, column types, first values.
utils::str(flourideData)
## 'data.frame':    917 obs. of  6 variables:
##  $ location                     : Factor w/ 917 levels "Abbot","Acton",..: 451 165 167 585 493 190 381 231 90 577 ...
##  $ n_wells_tested               : int  60 102 46 175 57 31 32 52 33 56 ...
##  $ percent_wells_above_guideline: num  30 22.5 19.6 18.3 17.5 16.1 15.6 15.4 15.2 14.3 ...
##  $ median                       : num  1.13 0.94 0.45 0.8 0.785 1.29 0.6 0.76 0.265 0.6 ...
##  $ percentile_95                : num  3.2 3.27 3.15 3.52 2.5 ...
##  $ maximum                      : num  3.6 7 3.9 6.9 2.7 3.3 6.1 4.1 4.2 3.3 ...
# Compact overview of the arsenic table: dimensions, column types, first values.
utils::str(arsenicData)
## 'data.frame':    917 obs. of  6 variables:
##  $ location                     : Factor w/ 917 levels "Abbot","Acton",..: 363 239 135 395 198 136 909 255 97 66 ...
##  $ n_wells_tested               : int  275 467 42 277 73 25 424 65 334 241 ...
##  $ percent_wells_above_guideline: num  58.9 50.1 50 49.5 49.3 48 44.8 44.6 43.4 42.7 ...
##  $ median                       : num  14 10.5 9.8 10 9.7 8.1 8.2 8.6 6 7 ...
##  $ percentile_95                : num  93 130 65.9 110 41.4 ...
##  $ maximum                      : num  200 460 200 368 45 71 240 431 670 930 ...

The summary function, applied to both data sets (flourideData and arsenicData), shows statistical details such as min, max, mean, ….

# Per-column summary statistics (and NA counts) for the fluoride table.
summary(object = flourideData)
##           location   n_wells_tested   percent_wells_above_guideline
##  Abbot        :  1   Min.   :  0.00   Min.   : 0.000               
##  Acton        :  1   1st Qu.:  0.00   1st Qu.: 0.000               
##  Adamstown Twp:  1   Median :  6.00   Median : 0.600               
##  Addison      :  1   Mean   : 38.17   Mean   : 2.448               
##  Albany Twp   :  1   3rd Qu.: 49.00   3rd Qu.: 3.125               
##  Albion       :  1   Max.   :503.00   Max.   :30.000               
##  (Other)      :911                    NA's   :557                  
##      median       percentile_95       maximum       
##  Min.   :0.1000   Min.   :0.1000   Min.   : 0.0500  
##  1st Qu.:0.1000   1st Qu.:0.5195   1st Qu.: 0.4225  
##  Median :0.1000   Median :0.9855   Median : 1.3000  
##  Mean   :0.1762   Mean   :1.1471   Mean   : 1.8987  
##  3rd Qu.:0.2000   3rd Qu.:1.5995   3rd Qu.: 2.9000  
##  Max.   :1.2900   Max.   :4.4400   Max.   :14.0000  
##  NA's   :557      NA's   :557      NA's   :363
# Per-column summary statistics (and NA counts) for the arsenic table.
summary(object = arsenicData)
##           location   n_wells_tested   percent_wells_above_guideline
##  Abbot        :  1   Min.   :  0.00   Min.   : 0.000               
##  Acton        :  1   1st Qu.:  0.00   1st Qu.: 3.225               
##  Adamstown Twp:  1   Median :  5.00   Median : 8.300               
##  Addison      :  1   Mean   : 33.99   Mean   :12.411               
##  Albany Twp   :  1   3rd Qu.: 41.00   3rd Qu.:18.375               
##  Albion       :  1   Max.   :632.00   Max.   :58.900               
##  (Other)      :911                    NA's   :575                  
##      median       percentile_95        maximum       
##  Min.   : 0.250   Min.   :  0.500   Min.   :   0.00  
##  1st Qu.: 0.500   1st Qu.:  6.265   1st Qu.:   6.20  
##  Median : 1.000   Median : 13.650   Median :  24.00  
##  Mean   : 1.617   Mean   : 25.550   Mean   :  67.35  
##  3rd Qu.: 1.887   3rd Qu.: 28.350   3rd Qu.:  64.00  
##  Max.   :14.000   Max.   :372.500   Max.   :3100.00  
##  NA's   :575      NA's   :575       NA's   :364
Load the library ggvis for the pipe function (%>%).
Plot both data sets using the ggvis package.
library(ggvis)
## 
## Attaching package: 'ggvis'
## The following objects are masked from 'package:plotly':
## 
##     add_data, hide_legend
## The following object is masked from 'package:ggplot2':
## 
##     resolution
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Scatter plots: number of wells tested (x) against the percentage of wells
# above the guideline (y), one plot per contaminant.
flourideData %>%
  ggvis(~n_wells_tested, ~percent_wells_above_guideline) %>%
  layer_points()
arsenicData %>%
  ggvis(~n_wells_tested, ~percent_wells_above_guideline) %>%
  layer_points()
# Re-read the fluoride table from disk. Nothing visible earlier in this report
# reassigns flourideData, so this reload looks redundant; it is kept to
# preserve the original flow. `TRUE` replaces the reassignable alias `T`.
flourideData <- read.csv("E:/R-work/flouride.csv", header = TRUE, sep = ",")
Use the function select from the package dplyr to choose specific columns.
library(knitr)
library(dplyr)
# Keep all six columns, giving the first three more descriptive names, and
# show the first six rows of the result.
flourideData %>%
  select(
    WellsLocation = location,
    NumbersWellsTested = n_wells_tested,
    PercWellsAboveGuideline = percent_wells_above_guideline,
    median,
    percentile_95,
    maximum
  ) %>%
  head()
##   WellsLocation NumbersWellsTested PercWellsAboveGuideline median
## 1          Otis                 60                    30.0  1.130
## 2        Dedham                102                    22.5  0.940
## 3       Denmark                 46                    19.6  0.450
## 4         Surry                175                    18.3  0.800
## 5      Prospect                 57                    17.5  0.785
## 6     Eastbrook                 31                    16.1  1.290
##   percentile_95 maximum
## 1         3.200     3.6
## 2         3.270     7.0
## 3         3.150     3.9
## 4         3.525     6.9
## 5         2.500     2.7
## 6         2.445     3.3
Use the function rename from the package dplyr to rename specific columns while keeping all the others.
# Rename two columns, sort by median in descending order, and show the first
# six rows.
flourideData %>%
  rename(Wells_Location = location, Numbers_Wells_Tested = n_wells_tested) %>%
  arrange(desc(median)) %>%
  head()
##   Wells_Location Numbers_Wells_Tested percent_wells_above_guideline median
## 1      Eastbrook                   31                          16.1  1.290
## 2           Otis                   60                          30.0  1.130
## 3     Marshfield                   31                          12.9  1.000
## 4         Dedham                  102                          22.5  0.940
## 5          Surry                  175                          18.3  0.800
## 6       Prospect                   57                          17.5  0.785
##   percentile_95 maximum
## 1         2.445     3.3
## 2         3.200     3.6
## 3         3.570     4.4
## 4         3.270     7.0
## 5         3.525     6.9
## 6         2.500     2.7
# Build two derived columns and drop everything else:
#   name  - location, wells tested and percent above guideline pasted together
#   dlist - TRUE where the median is at least 0.5
flourideData %>%
  transmute(
    name = paste(location, n_wells_tested, percent_wells_above_guideline),
    dlist = median >= 0.5
  ) %>%
  head()
##                name dlist
## 1        Otis 60 30  TRUE
## 2   Dedham 102 22.5  TRUE
## 3   Denmark 46 19.6 FALSE
## 4    Surry 175 18.3  TRUE
## 5  Prospect 57 17.5  TRUE
## 6 Eastbrook 31 16.1  TRUE
Use the function filter from the package dplyr to choose specific rows.
# Keep only the row whose median ranks fifth when ordered from highest to lowest.
filter(flourideData, row_number(desc(median)) == 5)
##   location n_wells_tested percent_wells_above_guideline median
## 1    Surry            175                          18.3    0.8
##   percentile_95 maximum
## 1         3.525     6.9
# Keep only the row whose median ranks third when ordered from highest to lowest.
filter(arsenicData, row_number(desc(median)) == 3)
##   location n_wells_tested percent_wells_above_guideline median
## 1 Monmouth            277                          49.5     10
##   percentile_95 maximum
## 1           110     368
Use the function slice from the package dplyr to select rows by position.
# Sort by median (highest first) and pick the row in sixth position.
flourideData %>%
  arrange(desc(median)) %>%
  slice(6)
## # A tibble: 1 x 6
##   location n_wells_tested percent_wells_above_guideline median
##     <fctr>          <int>                         <dbl>  <dbl>
## 1 Prospect             57                          17.5  0.785
## # ... with 2 more variables: percentile_95 <dbl>, maximum <dbl>
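Use the functions select, arrange and mutate from the package dplyr together with its ranking functions (percent_rank, cume_dist, ntile) and the offset functions lag and lead; distinct, by contrast, returns unique rows.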
# Take four columns, sort by median (highest first), then add three ranking
# columns computed from median: percent rank, cumulative distribution, and a
# four-bucket ntile. Only the first six rows are shown.
flourideData %>%
  select(location, n_wells_tested, percent_wells_above_guideline, median) %>%
  arrange(desc(median)) %>%
  mutate(
    p_rank = percent_rank(median),
    cdist = cume_dist(median),
    ntile = ntile(median, 4)
  ) %>%
  head()
##     location n_wells_tested percent_wells_above_guideline median    p_rank
## 1  Eastbrook             31                          16.1  1.290 1.0000000
## 2       Otis             60                          30.0  1.130 0.9972145
## 3 Marshfield             31                          12.9  1.000 0.9944290
## 4     Dedham            102                          22.5  0.940 0.9916435
## 5      Surry            175                          18.3  0.800 0.9888579
## 6   Prospect             57                          17.5  0.785 0.9860724
##       cdist ntile
## 1 1.0000000     4
## 2 0.9972222     4
## 3 0.9944444     4
## 4 0.9916667     4
## 5 0.9888889     4
## 6 0.9861111     4
# Sort by median (highest first), then attach each row's neighbours:
#   nxt_better - median of the previous row (lag)
#   nxt_worst  - median of the following row (lead)
arsenicData %>%
  arrange(desc(median)) %>%
  mutate(nxt_better = lag(median), nxt_worst = lead(median)) %>%
  head()
##     location n_wells_tested percent_wells_above_guideline median
## 1 Manchester            275                          58.9   14.0
## 2     Gorham            467                          50.1   10.5
## 3   Monmouth            277                          49.5   10.0
## 4   Columbia             42                          50.0    9.8
## 5      Eliot             73                          49.3    9.7
## 6  Hallowell             65                          44.6    8.6
##   percentile_95 maximum nxt_better nxt_worst
## 1         93.00     200         NA      10.5
## 2        130.00     460       14.0      10.0
## 3        110.00     368       10.5       9.8
## 4         65.90     200       10.0       9.7
## 5         41.35      45        9.8       8.6
## 6        100.00     431        9.7       8.2

Visualization with ggplot2

library(ggvis)
library(ggplot2)
# Median arsenic (x) against number of wells tested (y): points plus a smoothed
# trend line.
ggplot(data = arsenicData, mapping = aes(x = median, y = n_wells_tested)) +
  geom_point() +
  geom_smooth()
# Next step: copy the above command but show only the smooth line.
## `geom_smooth()` using method = 'loess'
## Warning: Removed 575 rows containing non-finite values (stat_smooth).
## Warning: Removed 575 rows containing missing values (geom_point).

# Same axes as the previous plot, drawing only the smoothed trend line.
ggplot(data = arsenicData, mapping = aes(x = median, y = n_wells_tested)) +
  geom_smooth()
# Next step: copy the above command and assign the correct value to col in aes().
## `geom_smooth()` using method = 'loess'
## Warning: Removed 575 rows containing non-finite values (stat_smooth).

# Smoothed trend line with percent_wells_above_guideline mapped to colour.
ggplot(
  data = arsenicData,
  mapping = aes(
    x = median,
    y = n_wells_tested,
    col = percent_wells_above_guideline
  )
) +
  geom_smooth()
# Next step: keep the colour settings and plot only the points, using alpha.
## `geom_smooth()` using method = 'loess'
## Warning: Removed 575 rows containing non-finite values (stat_smooth).

# Points only, coloured by percent_wells_above_guideline and drawn with
# alpha = 0.3.
ggplot(
  data = arsenicData,
  mapping = aes(
    x = median,
    y = n_wells_tested,
    col = percent_wells_above_guideline
  )
) +
  geom_point(alpha = 0.3)
## Warning: Removed 575 rows containing missing values (geom_point).

# Store the base plot (median arsenic on x, wells tested on y) so different
# layers can be added to it. Assignment uses `<-`, the idiomatic R operator,
# rather than `=`.
ars_plot <- ggplot(arsenicData, aes(x = median, y = n_wells_tested))
# Add a point layer to the stored base plot.
ars_plot + geom_point()
# Next step: add the same geom layer, but with aes() inside.
## Warning: Removed 575 rows containing missing values (geom_point).

# Point layer on the stored base plot, with colour mapped inside the layer's
# own aes() to percent_wells_above_guideline.
ars_plot +
  geom_point(mapping = aes(col = percent_wells_above_guideline))
## Warning: Removed 575 rows containing missing values (geom_point).

# Wells tested against the 95th percentile: points with a smooth, per data set.
flourideData %>%
  ggvis(~n_wells_tested, ~percentile_95) %>%
  layer_points() %>%
  layer_smooths()
arsenicData %>%
  ggvis(~n_wells_tested, ~percentile_95) %>%
  layer_points() %>%
  layer_smooths()

# Wells tested against percent above guideline: points with a smooth.
flourideData %>%
  ggvis(~n_wells_tested, ~percent_wells_above_guideline) %>%
  layer_points() %>%
  layer_smooths()
arsenicData %>%
  ggvis(~n_wells_tested, ~percent_wells_above_guideline) %>%
  layer_points() %>%
  layer_smooths()

# The same two variables drawn as bars.
flourideData %>%
  ggvis(~n_wells_tested, ~percent_wells_above_guideline) %>%
  layer_bars()
arsenicData %>%
  ggvis(~n_wells_tested, ~percent_wells_above_guideline) %>%
  layer_bars()

Histograms and density plots for both data sets

# Histogram of the number of wells tested per location (fluoride table).
flourideData %>%
  ggvis(~n_wells_tested) %>%
  layer_histograms()
## Guessing width = 20 # range / 26
# Histogram of the number of wells tested per location (arsenic table).
arsenicData %>%
  ggvis(~n_wells_tested) %>%
  layer_histograms()
## Guessing width = 20 # range / 32
# Density curves of the number of wells tested: fluoride filled green,
# arsenic filled red.
flourideData %>%
  ggvis(~n_wells_tested, fill := "green") %>%
  layer_densities()
arsenicData %>%
  ggvis(~n_wells_tested, fill := "red") %>%
  layer_densities()