library(DMwR2)
## Warning: package 'DMwR2' was built under R version 4.2.1
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
# Load the algae data frame bundled with DMwR2.
data("algae", package = "DMwR2")
# Mean of the a1 column.
mean(algae[["a1"]])
## [1] 16.9235
# Without na.rm the mean of NO3 comes back as NA.
mean(algae[["NO3"]])
## [1] NA
# Same column with the missing values dropped first.
mean(algae[["NO3"]], na.rm = TRUE)
## [1] 3.282389
# Medians, again dropping NAs where requested.
median(algae[["a3"]])
## [1] 1.55
median(algae[["mxPH"]], na.rm = TRUE)
## [1] 8.06
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Wrap algae in a tibble. as_tibble() replaces tbl_df(), which has been
# deprecated since dplyr 1.0.0.
alg <- as_tibble(algae)
# Mean of NO3 (NAs dropped) and median of a1 in a single one-row summary.
alg %>%
  summarise(
    avgN03 = mean(NO3, na.rm = TRUE),
    medA1 = median(a1)
  )
## # A tibble: 1 × 2
## avgN03 medA1
## <dbl> <dbl>
## 1 3.28 6.95
# Mean and median (NAs dropped) of mxPH, mnO2 and Cl. across() replaces
# summarise_each()/funs(), deprecated since dplyr 0.7.0 and 0.8.0.
# Two across() calls are used so that all the means come before all the
# medians in the result.
alg %>%
  summarise(
    across(c(mxPH, mnO2, Cl), ~ mean(., na.rm = TRUE), .names = "{.col}_mean"),
    across(c(mxPH, mnO2, Cl), ~ median(., na.rm = TRUE), .names = "{.col}_median")
  )
## # A tibble: 1 × 6
## mxPH_mean mnO2_mean Cl_mean mxPH_median mnO2_median Cl_median
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 8.01 9.12 43.6 8.06 9.8 32.7
# Following dplyr's deprecation advice: a mean/median summary written with
# across(). alg1 is a tibble holding only the rows of algae without any NA.
alg <- as_tibble(algae)
alg1 <- na.omit(alg)
summarise(
  alg1,
  across(mxPH:Cl, list(mean = mean, median = median))
)
## # A tibble: 1 × 6
## mxPH_mean mxPH_median mnO2_mean mnO2_median Cl_mean Cl_median
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 8.08 8.1 9.02 9.75 44.9 35.1
# Observation count and median of a7 for every season/size combination,
# sorted by decreasing median. `.groups = "drop"` returns an ungrouped tibble
# directly, so there is no ungroup() step and no regrouping message.
# summarise() is spelled with an "s" because Hmisc, attached further down,
# masks summarize().
alg %>%
  group_by(season, size) %>%
  summarise(n0bs = n(), mA7 = median(a7), .groups = "drop") %>%
  arrange(desc(mA7))
## # A tibble: 12 × 4
## season size n0bs mA7
## <fct> <fct> <int> <dbl>
## 1 spring large 12 1.95
## 2 summer small 14 1.45
## 3 winter medium 26 1.4
## 4 autumn medium 16 1.05
## 5 spring medium 21 1
## 6 summer medium 21 1
## 7 autumn large 11 0
## 8 autumn small 13 0
## 9 spring small 20 0
## 10 summer large 10 0
## 11 winter large 12 0
## 12 winter small 24 0
## Mode: the most frequent value of a vector
# x      vector of values
# na.rm  when TRUE, missing values are removed before counting
# On ties the value that appears first in x is returned.
Mode <- function(x, na.rm = FALSE) {
  if (na.rm) {
    x <- x[!is.na(x)]
  }
  distinct <- unique(x)
  # Count how often each distinct value occurs, in order of first appearance.
  counts <- tabulate(match(x, distinct))
  distinct[which.max(counts)]
}
# Most frequent mxPH value, ignoring NAs.
Mode(algae[["mxPH"]], na.rm = TRUE)
## [1] 8
# The same, applied to every column from mxPH to NH4.
summarise(alg, across(mxPH:NH4, function(col) Mode(col, na.rm = TRUE)))
## # A tibble: 1 × 5
## mxPH mnO2 Cl NO3 NH4
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 8 9.8 7 1.32 10
# Mode also works on factors.
Mode(algae[["season"]])
## [1] winter
## Levels: autumn spring summer winter
summarise(alg, across(season:speed, function(col) Mode(col, na.rm = TRUE)))
## # A tibble: 1 × 3
## season size speed
## <fct> <fct> <fct>
## 1 winter medium high
# centralValue() on a numeric column and on a factor column.
centralValue(algae[["a1"]])
## [1] 6.95
centralValue(algae[["speed"]])
## [1] "high"
# Variance and standard deviation.
var(algae[["a1"]])
## [1] 455.7532
sd(algae[["Cl"]], na.rm = TRUE)
## [1] 46.83131
# Interquartile range.
IQR(algae[["mxPH"]], na.rm = TRUE)
## [1] 0.7
# Default quantiles, then the 20% and 80% quantiles.
quantile(algae[["a3"]])
## 0% 25% 50% 75% 100%
## 0.000 0.000 1.550 4.925 42.800
quantile(algae[["a3"]], probs = c(0.2, 0.8))
## 20% 80%
## 0.00 7.06
# Smallest and largest value.
range(algae[["a1"]])
## [1] 0.0 89.8
# For factors, levels() lists the possible values.
levels(algae[["season"]])
## [1] "autumn" "spring" "summer" "winter"
# Width of the range: largest minus smallest value.
diff(range(algae[["a5"]]))
## [1] 44.4
# Variance of each of the columns a1 to a7. across() replaces the deprecated
# summarise_each()/funs() pair.
alg %>% summarise(across(a1:a7, var))
## # A tibble: 1 × 7
## a1 a2 a3 a4 a5 a6 a7
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 456. 122. 48.3 19.5 56.1 136. 26.6
data("iris")
# The book states that summarise() fails when the summary function returns a
# non-scalar value (quantile() yields five values per group); with the dplyr
# version used here it runs and produces one row per quantile.
# The summary is named with a plain `qs = ...` argument: wrapping the
# assignment in an extra pair of parentheses made dplyr use the whole
# expression text as the column name.
iris %>%
  group_by(Species) %>%
  summarise(qs = quantile(Sepal.Length))
## `summarise()` has grouped output by 'Species'. You can override using the
## `.groups` argument.
## # A tibble: 15 × 2
## # Groups: Species [3]
## Species qs
## <fct> <dbl>
## 1 setosa 4.3
## 2 setosa 4.8
## 3 setosa 5
## 4 setosa 5.2
## 5 setosa 5.8
## 6 versicolor 4.9
## 7 versicolor 5.6
## 8 versicolor 5.9
## 9 versicolor 6.3
## 10 versicolor 7
## 11 virginica 4.9
## 12 virginica 6.22
## 13 virginica 6.5
## 14 virginica 6.9
## 15 virginica 7.9
# Base R alternative: aggregate() prints one row per species with the
# quantiles laid out side by side.
aggregate(
  x = iris$Sepal.Length,
  by = list(Species = iris$Species),
  FUN = quantile
)
## Species x.0% x.25% x.50% x.75% x.100%
## 1 setosa 4.300 4.800 5.000 5.200 5.800
## 2 versicolor 4.900 5.600 5.900 6.300 7.000
## 3 virginica 4.900 6.225 6.500 6.900 7.900
## Dealing with NAs and outliers
# Number of NAs in each row. rowSums() over the logical NA matrix replaces
# apply() with an anonymous function, which coerced the whole data frame to a
# character matrix before counting.
nasRow <- rowSums(is.na(algae))
cat("The Algae dataset contains ", sum(nasRow), " NA values \n")
## The Algae dataset contains 33 NA values
cat("There are ", sum(!complete.cases(algae)), " rows that have at least one NA value \n")
## There are 16 rows that have at least one NA value
## Boxplot rule for detecting outlier values
# Flags the elements of x lying more than `const` interquartile ranges below
# the first quartile or above the third quartile.
#   x          numeric vector; NAs are ignored when computing the quartiles
#              and are never reported as outliers
#   const      multiplier of the IQR that sets the fences (default 1.5)
#   positions  FALSE (default) returns the outlying values, TRUE returns their
#              positions in x as it was passed in
bpRule <- function(x, const = 1.5, positions = FALSE) {
  # na.rm = TRUE gives the same quartiles as stripping the NAs up front, but
  # leaves x intact so that positions keep referring to the caller's vector
  # (stripping NAs first shifted every position after a missing value).
  qs <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE, names = FALSE)
  iqr <- qs[2] - qs[1]
  # Comparisons against NA yield NA, which which() discards.
  outlying <- which(x < qs[1] - const * iqr | x > qs[2] + const * iqr)
  if (positions) outlying else x[outlying]
}
# Outlying values of a1 under the default 1.5 * IQR fences.
bpRule(x = algae$a1)
## [1] 69.9 74.2 66.0 75.8 89.8 81.9 82.7 66.9 64.2 64.9 64.3 86.6
# Outlying values of NO3.
bpRule(x = algae$NO3)
## [1] 10.416 9.248 9.773 9.715 45.650
# Positions of the NO3 outliers instead of their values.
# NOTE(review): NO3 contains NAs (its plain mean is NA); re-check these
# positions against the row numbers of algae whenever bpRule() changes how it
# treats NAs.
bpRule(x = algae$NO3, positions = TRUE)
## [1] 5 6 139 144 152
# Reload iris and print the per-column summaries.
data("iris")
summary(object = iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
library(Hmisc)
## Warning: package 'Hmisc' was built under R version 4.2.1
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
# Per-variable description of iris using describe() (Hmisc is attached just
# above). The call is left exactly as written: the "iris" heading of the
# printed report matches the argument text.
# NOTE(review): that heading is presumably derived from the call; confirm
# before piping or renaming the argument.
describe(iris)
## iris
##
## 5 Variables 150 Observations
## --------------------------------------------------------------------------------
## Sepal.Length
## n missing distinct Info Mean Gmd .05 .10
## 150 0 35 0.998 5.843 0.9462 4.600 4.800
## .25 .50 .75 .90 .95
## 5.100 5.800 6.400 6.900 7.255
##
## lowest : 4.3 4.4 4.5 4.6 4.7, highest: 7.3 7.4 7.6 7.7 7.9
## --------------------------------------------------------------------------------
## Sepal.Width
## n missing distinct Info Mean Gmd .05 .10
## 150 0 23 0.992 3.057 0.4872 2.345 2.500
## .25 .50 .75 .90 .95
## 2.800 3.000 3.300 3.610 3.800
##
## lowest : 2.0 2.2 2.3 2.4 2.5, highest: 3.9 4.0 4.1 4.2 4.4
## --------------------------------------------------------------------------------
## Petal.Length
## n missing distinct Info Mean Gmd .05 .10
## 150 0 43 0.998 3.758 1.979 1.30 1.40
## .25 .50 .75 .90 .95
## 1.60 4.35 5.10 5.80 6.10
##
## lowest : 1.0 1.1 1.2 1.3 1.4, highest: 6.3 6.4 6.6 6.7 6.9
## --------------------------------------------------------------------------------
## Petal.Width
## n missing distinct Info Mean Gmd .05 .10
## 150 0 22 0.99 1.199 0.8676 0.2 0.2
## .25 .50 .75 .90 .95
## 0.3 1.3 1.8 2.2 2.3
##
## lowest : 0.1 0.2 0.3 0.4 0.5, highest: 2.1 2.2 2.3 2.4 2.5
## --------------------------------------------------------------------------------
## Species
## n missing distinct
## 150 0 3
##
## Value setosa versicolor virginica
## Frequency 50 50 50
## Proportion 0.333 0.333 0.333
## --------------------------------------------------------------------------------
# Summaries of size, speed, mxPH and mnO2, computed separately per season.
by(
  data = algae[, c("size", "speed", "mxPH", "mnO2")],
  INDICES = algae$season,
  FUN = summary
)
## algae$season: autumn
## size speed mxPH mnO2
## large :11 high :15 Min. :5.700 Min. : 6.50
## medium:16 low : 8 1st Qu.:7.588 1st Qu.:10.22
## small :13 medium:17 Median :8.060 Median :10.90
## Mean :7.952 Mean :10.60
## 3rd Qu.:8.400 3rd Qu.:11.43
## Max. :8.870 Max. :12.90
## ------------------------------------------------------------
## algae$season: spring
## size speed mxPH mnO2
## large :12 high :21 Min. :5.600 Min. : 1.800
## medium:21 low : 8 1st Qu.:7.790 1st Qu.: 6.000
## small :20 medium:24 Median :8.070 Median : 8.900
## Mean :8.024 Mean : 8.010
## 3rd Qu.:8.400 3rd Qu.: 9.875
## Max. :9.500 Max. :12.500
## NA's :1
## ------------------------------------------------------------
## algae$season: summer
## size speed mxPH mnO2
## large :10 high :20 Min. :6.400 Min. : 4.400
## medium:21 low : 7 1st Qu.:7.600 1st Qu.: 8.125
## small :14 medium:18 Median :8.000 Median :10.100
## Mean :7.905 Mean : 9.415
## 3rd Qu.:8.200 3rd Qu.:10.875
## Max. :8.800 Max. :12.100
## NA's :1
## ------------------------------------------------------------
## algae$season: winter
## size speed mxPH mnO2
## large :12 high :28 Min. :6.600 Min. : 1.500
## medium:26 low :10 1st Qu.:7.800 1st Qu.: 7.625
## small :24 medium:24 Median :8.100 Median : 9.500
## Mean :8.119 Mean : 8.880
## 3rd Qu.:8.430 3rd Qu.:10.650
## Max. :9.700 Max. :13.400
## NA's :1
You can also embed plots, for example:
Note that the `echo = FALSE` parameter was added to the code chunk
to prevent printing of the R code that generated the plot.