# Read in csv files
library(readr)
#CSV_File <- read_csv("F:/R_Training/Ecoli_Data_set(Auckland).csv")
#CSV_File <- read_csv("C:/Users/Jorn/Documents/R_training/Ecoli_Data_set(HBRC).csv")
# Load the HBRC E. coli data set into a tibble; readr reports the column
# specification it guessed when no col_types are supplied.
CSV_File <- read_csv(
  file = "C:\\Users\\Jorn\\Documents\\R_training\\Ecoli_Data_set(HBRC).csv"
)
## Parsed with column specification:
## cols(
##   Agency = col_character(),
##   SiteName = col_character(),
##   Name = col_character(),
##   NumItems = col_integer(),
##   TSType = col_character(),
##   DataType = col_character(),
##   Interpolation = col_character(),
##   ItemNumber = col_integer(),
##   ItemName = col_character(),
##   ItemFormat = col_logical(),
##   Units = col_character(),
##   Format = col_character(),
##   DateFormat = col_character(),
##   NumItems2 = col_integer(),
##   T = col_character(),
##   I1 = col_double(),
##   I2 = col_character()
## )
# Inspect the full table in the data viewer.
View(CSV_File)

# Subset to fewer columns: keep only the site, measurement name, timestamp,
# value and units fields.
CSV_File_Subset1 <- CSV_File[c("SiteName", "Name", "T", "I1", "Units")]
View(CSV_File_Subset1)

# Subset based on a query of values, example 1: every row for one site.
View(subset(CSV_File_Subset1, SiteName == "Wairoa River at Railway Br."))

# Subset based on a query of values, example 2: one site and one measurement.
View(subset(
  CSV_File_Subset1,
  SiteName == "Wairoa River at Railway Br." & Name == "E. Coli"
))

# Subset based on a query of values, example 3: as above, values above 1000.
View(subset(
  CSV_File_Subset1,
  SiteName == "Wairoa River at Railway Br." & Name == "E. Coli" & I1 > 1000
))

# Subset based on a query of values, example 4: as above, values below 50 OR
# above 1000. `&` binds tighter than `|` in R, so the OR must be wrapped in
# parentheses; without them every row with I1 > 1000 is returned, whatever its
# site or measurement.
View(subset(
  CSV_File_Subset1,
  SiteName == "Wairoa River at Railway Br." & Name == "E. Coli" &
    (I1 < 50 | I1 > 1000)
))

# Subset based on a query of values, example 5: as above, values strictly
# between 150 and 800.
View(subset(
  CSV_File_Subset1,
  SiteName == "Wairoa River at Railway Br." & Name == "E. Coli" &
    I1 > 150 & I1 < 800
))

# View the unique values of a column/field, example 1: distinct site names.
View(unique(CSV_File_Subset1[["SiteName"]]))

# View the unique values of a column/field, example 2: distinct values pooled
# from the SiteName and Name columns into one vector (not site/name pairs).
View(union(CSV_File_Subset1[["SiteName"]], CSV_File_Subset1[["Name"]]))

# Stats data set: Dissolved Reactive Phosphorus rows for a single site.
Statsdataset <- subset(
  CSV_File_Subset1,
  Name == "Dissolved Reactive Phosphorus" &
    SiteName == "Wairoa River at Railway Br."
)
View(Statsdataset)

# Minimum measured Dissolved Reactive Phosphorus
min(Statsdataset[["I1"]])
## [1] 0.004
# Maximum measured Dissolved Reactive Phosphorus
max(Statsdataset[["I1"]])
## [1] 0.043
# Mean measured Dissolved Reactive Phosphorus
mean(Statsdataset[["I1"]])
## [1] 0.01074048
# Median measured Dissolved Reactive Phosphorus
median(Statsdataset[["I1"]])
## [1] 0.0085
# Five-number summary plus mean of measured Dissolved Reactive Phosphorus
summary(Statsdataset[["I1"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00400 0.00415 0.00850 0.01074 0.01432 0.04300
# Boxplot example (1 variable & 1 site)
library(ggplot2) # geom_boxplot offers several arguments to customise appearance
ggplot(data = Statsdataset, mapping = aes(x = Name, y = I1)) +
  geom_boxplot(
    # Box styling: blue outline with a mostly transparent blue fill.
    colour = "blue", fill = "blue", alpha = 0.2,
    # Draw the box with a notch.
    notch = TRUE, notchwidth = 0.8,
    # Outlier styling: larger red points.
    outlier.colour = "red", outlier.fill = "red", outlier.size = 3
  )

# Subset based on a query of values: all E. Coli rows, across every site.
Statsdataset2 <- subset(CSV_File_Subset1, Name == "E. Coli")
View(Statsdataset2)

# One box per site, ordered along the x axis by reorder() (which summarises I1
# per site with the mean by default); the legend is hidden.
ggplot(
  data = Statsdataset2,
  mapping = aes(x = reorder(SiteName, I1), y = I1, fill = SiteName)
) +
  geom_boxplot() +
  labs(x = "SiteName") +
  theme(legend.position = "none")

# Graph: give the columns presentation-friendly headings first.
library(plyr) # provides rename() for fields/headings/columns
Statsdataset3 <- rename(
  Statsdataset2,
  replace = c(
    "SiteName" = "Site Name",
    "Name" = "Measurement",
    "T" = "Time",
    "I1" = "E. Coli (cfu/100ml)"
  )
)
View(Statsdataset3)
# Parse the sample dates with an explicit day/month/four-digit-year format.
# Earlier runs of this script showed two symptoms of a wrong parse:
#   * as.POSIXlt() without a format returned years such as -1870 (years since
#     1900), i.e. it read the day of month as the year;
#   * as.Date(..., "%d/%m/%y") placed every date in 2020, because %y consumes
#     only the first two digits ("20") of a four-digit year.
# The console output recorded from those runs was removed because it reflected
# the misparsed dates.
# NOTE(review): assumes Time is text like "22/12/2015" - confirm against the CSV.
xlt <- as.POSIXlt(Statsdataset3$Time, format = "%d/%m/%Y")
xlt$year + 1900 # POSIXlt stores the year as an offset from 1900

# Convert the column once; it is a Date from here on, so no second parse is
# needed when viewing it.
Statsdataset3$Time <- as.Date(Statsdataset3$Time, format = "%d/%m/%Y")
View(Statsdataset3$Time)

# Date range covered by the E. coli samples (NA here means some values did not
# match the expected format).
min(Statsdataset3$Time)
max(Statsdataset3$Time)
# Box plot per site with the individual measurements jittered on top.
# Written with ggplot() rather than qplot(), which is deprecated as of
# ggplot2 3.4.0; the layers are the same ones qplot() built from
# geom = c("boxplot", "jitter").
ggplot(
  Statsdataset3,
  aes(x = `Site Name`, y = `E. Coli (cfu/100ml)`, fill = `Site Name`)
) +
  geom_boxplot() +
  geom_jitter() +
  ggtitle("E. Coli Measurements")