library(DMwR2)
## Warning: package 'DMwR2' was built under R version 4.2.1
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
# Load the algae data set that ships with DMwR2.
data(algae, package = "DMwR2")
# Mean of a column without missing values.
mean(algae$a1)
## [1] 16.9235
# NO3 contains NAs, so the plain mean is NA ...
mean(algae$NO3)
## [1] NA
# ... unless the NAs are removed before averaging.
mean(algae$NO3, na.rm = TRUE)
## [1] 3.282389
# Medians, again dropping NAs where the column has them.
median(algae$a3)
## [1] 1.55
median(algae$mxPH, na.rm = TRUE)
## [1] 8.06
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Wrap the data frame in a tibble. tbl_df() was deprecated in dplyr 1.0.0;
# as_tibble() is the supported replacement and raises no lifecycle warning.
alg <- as_tibble(algae)
# Two summaries in one call: mean of NO3 (ignoring NAs) and median of a1.
alg %>%
  summarise(
    avgN03 = mean(NO3, na.rm = TRUE),
    medA1 = median(a1)
  )
## # A tibble: 1 × 2
##   avgN03 medA1
##    <dbl> <dbl>
## 1   3.28  6.95
# Mean and median (ignoring NAs) of every column from mxPH to Cl.
# summarise_each() and funs() are deprecated, so across() with a named list of
# lambdas is used instead. across() names its results column by column; the
# final select() puts all the means first and then all the medians.
alg %>%
  select(mxPH:Cl) %>%
  summarise(across(
    everything(),
    list(
      mean = ~ mean(.x, na.rm = TRUE),
      median = ~ median(.x, na.rm = TRUE)
    )
  )) %>%
  select(ends_with("_mean"), ends_with("_median"))
## # A tibble: 1 × 6
##   mxPH_mean mnO2_mean Cl_mean mxPH_median mnO2_median Cl_median
##       <dbl>     <dbl>   <dbl>       <dbl>       <dbl>     <dbl>
## 1      8.01      9.12    43.6        8.06         9.8      32.7
# Taking the warning into consideration: build the tibble with as_tibble()
# and summarise with across().
# Create a tibble with no NAs. na.omit() removes every row that has an NA in
# any column, so the statistics below are computed on complete rows only.
alg <- as_tibble(algae)
alg1 <- alg %>% na.omit()
summarise(alg1, across(mxPH:Cl, list(mean = mean, median = median)))
## # A tibble: 1 × 6
##   mxPH_mean mxPH_median mnO2_mean mnO2_median Cl_mean Cl_median
##       <dbl>       <dbl>     <dbl>       <dbl>   <dbl>     <dbl>
## 1      8.08         8.1      9.02        9.75    44.9      35.1
# Number of observations and median of a7 for each season/size combination,
# ordered from the largest median to the smallest.
alg %>%
  group_by(season, size) %>%
  summarise(n0bs = n(), mA7 = median(a7)) %>%
  ungroup() %>%
  arrange(desc(mA7))
## `summarise()` has grouped output by 'season'. You can override using the
## `.groups` argument.
## # A tibble: 12 × 4
##    season size    n0bs   mA7
##    <fct>  <fct>  <int> <dbl>
##  1 spring large     12  1.95
##  2 summer small     14  1.45
##  3 winter medium    26  1.4 
##  4 autumn medium    16  1.05
##  5 spring medium    21  1   
##  6 summer medium    21  1   
##  7 autumn large     11  0   
##  8 autumn small     13  0   
##  9 spring small     20  0   
## 10 summer large     10  0   
## 11 winter large     12  0   
## 12 winter small     24  0
# Mode: return the most frequent value of a vector.
#   x      vector (numeric, character, factor, ...)
#   na.rm  if TRUE, NAs are dropped before counting
# When several values are tied, the one that appears first in x is returned.
Mode <- function(x, na.rm = FALSE) {
  if (na.rm) {
    x <- x[!is.na(x)]
  }
  distinct_vals <- unique(x)
  # Map each element to the index of its distinct value and count per index.
  counts <- tabulate(match(x, distinct_vals))
  distinct_vals[which.max(counts)]
}
# Most frequent mxPH value, ignoring NAs.
Mode(algae$mxPH, na.rm = TRUE)
## [1] 8
# Applying it across more attributes
alg %>%
  summarise(across(mxPH:NH4, ~ Mode(.x, na.rm = TRUE)))
## # A tibble: 1 × 5
##    mxPH  mnO2    Cl   NO3   NH4
##   <dbl> <dbl> <dbl> <dbl> <dbl>
## 1     8   9.8     7  1.32    10
# Mode() also works on factors and returns a factor.
Mode(algae$season)
## [1] winter
## Levels: autumn spring summer winter
alg %>%
  summarise(across(season:speed, ~ Mode(.x, na.rm = TRUE)))
## # A tibble: 1 × 3
##   season size   speed
##   <fct>  <fct>  <fct>
## 1 winter medium high
# centralValue() gives 6.95 for the numeric a1 (its median, as computed
# earlier) and the most frequent level for the factor speed.
centralValue(algae$a1)
## [1] 6.95
centralValue(algae$speed)
## [1] "high"
# Measures of spread: variance, standard deviation, inter-quartile range.
var(algae$a1)
## [1] 455.7532
sd(algae$Cl, na.rm = TRUE)
## [1] 46.83131
IQR(algae$mxPH, na.rm = TRUE)
## [1] 0.7
# Default quantiles, then user-chosen probabilities.
quantile(algae$a3)
##     0%    25%    50%    75%   100% 
##  0.000  0.000  1.550  4.925 42.800
quantile(algae$a3, probs = c(0.2, 0.8))
##  20%  80% 
## 0.00 7.06
range(algae$a1)
## [1]  0.0 89.8
# For factors we may use levels
levels(algae$season)
## [1] "autumn" "spring" "summer" "winter"
# Width of the range of a5 (maximum minus minimum).
diff(range(algae$a5))
## [1] 44.4
# Variance of each algae frequency column a1..a7.
# across() replaces the deprecated summarise_each()/funs() pair and keeps the
# original column names and order.
alg %>%
  summarise(across(a1:a7, var))
## # A tibble: 1 × 7
##      a1    a2    a3    a4    a5    a6    a7
##   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1  456.  122.  48.3  19.5  56.1  136.  26.6
data("iris")
# The book states that summarise() on a grouped table fails when the function
# returns a non-scalar value. With the dplyr version used here it runs and
# returns one row per quantile and group.
# The summary is named directly (qs = ...): wrapping the argument in an extra
# pair of parentheses discards the name, and the column is then labelled with
# the whole expression.
group_by(iris, Species) %>%
  summarise(qs = quantile(Sepal.Length))
## `summarise()` has grouped output by 'Species'. You can override using the
## `.groups` argument.
## # A tibble: 15 × 2
## # Groups:   Species [3]
##    Species       qs
##    <fct>      <dbl>
##  1 setosa      4.3 
##  2 setosa      4.8 
##  3 setosa      5   
##  4 setosa      5.2 
##  5 setosa      5.8 
##  6 versicolor  4.9 
##  7 versicolor  5.6 
##  8 versicolor  5.9 
##  9 versicolor  6.3 
## 10 versicolor  7   
## 11 virginica   4.9 
## 12 virginica   6.22
## 13 virginica   6.5 
## 14 virginica   6.9 
## 15 virginica   7.9
# Base R alternative: aggregate() places the quantiles of each species on a
# single row, which makes the result easier to read.
with(iris, aggregate(Sepal.Length, list(Species = Species), quantile))
##      Species  x.0% x.25% x.50% x.75% x.100%
## 1     setosa 4.300 4.800 5.000 5.200  5.800
## 2 versicolor 4.900 5.600 5.900 6.300  7.000
## 3  virginica 4.900 6.225 6.500 6.900  7.900
## Dealing with NAs and outliers
# Number of NAs in each row. rowSums() over the logical matrix from is.na()
# avoids apply() on a data frame, which first coerces every row to one
# common type.
nasRow <- rowSums(is.na(algae))
cat("The Algae dataset contains ", sum(nasRow), " NA values \n")
## The Algae dataset contains  33  NA values
# complete.cases() is TRUE for rows without any NA, so its negation counts
# the rows that have at least one.
cat("There are ", sum(!complete.cases(algae)), " rows that have at least one NA value \n")
## There are  16  rows that have at least one NA value
## Boxplot Rule to detect outlier values
# bpRule: flag values lying more than const * IQR below the first quartile
# or above the third quartile.
#   x          numeric vector; NAs are ignored when computing the quartiles
#              and are never reported as outliers
#   const      multiplier applied to the inter-quartile range (default 1.5)
#   positions  FALSE: return the outlier values;
#              TRUE:  return their positions in x as it was passed in
bpRule <- function(x, const = 1.5, positions = FALSE) {
  qs <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE, names = FALSE)
  iqr <- qs[2] - qs[1]
  lower <- qs[1] - const * iqr
  upper <- qs[2] + const * iqr
  # which() skips NA comparisons, so x does not have to be stripped of NAs
  # first and the indices keep referring to the caller's vector.
  outlier_idx <- which(x < lower | x > upper)
  if (positions) outlier_idx else x[outlier_idx]
}
# Outlier values of a1 under the boxplot rule with the default const = 1.5
bpRule(algae$a1)
##  [1] 69.9 74.2 66.0 75.8 89.8 81.9 82.7 66.9 64.2 64.9 64.3 86.6
# Displaying the outlier values of NO3
bpRule(algae$NO3)
## [1] 10.416  9.248  9.773  9.715 45.650
# Displaying the positions of the outliers instead of their values
bpRule(algae$NO3, positions=TRUE)
## [1]   5   6 139 144 152
# Reload iris and print the base R summary: quartiles and mean for the
# numeric columns, level counts for the Species factor
data(iris)
summary(iris)
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
## 
library(Hmisc)
## Warning: package 'Hmisc' was built under R version 4.2.1
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
# describe() (presumably Hmisc's, loaded just above) prints a more detailed
# per-variable report: n, missing, distinct values and further quantiles
describe(iris)
## iris 
## 
##  5  Variables      150  Observations
## --------------------------------------------------------------------------------
## Sepal.Length 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      150        0       35    0.998    5.843   0.9462    4.600    4.800 
##      .25      .50      .75      .90      .95 
##    5.100    5.800    6.400    6.900    7.255 
## 
## lowest : 4.3 4.4 4.5 4.6 4.7, highest: 7.3 7.4 7.6 7.7 7.9
## --------------------------------------------------------------------------------
## Sepal.Width 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      150        0       23    0.992    3.057   0.4872    2.345    2.500 
##      .25      .50      .75      .90      .95 
##    2.800    3.000    3.300    3.610    3.800 
## 
## lowest : 2.0 2.2 2.3 2.4 2.5, highest: 3.9 4.0 4.1 4.2 4.4
## --------------------------------------------------------------------------------
## Petal.Length 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      150        0       43    0.998    3.758    1.979     1.30     1.40 
##      .25      .50      .75      .90      .95 
##     1.60     4.35     5.10     5.80     6.10 
## 
## lowest : 1.0 1.1 1.2 1.3 1.4, highest: 6.3 6.4 6.6 6.7 6.9
## --------------------------------------------------------------------------------
## Petal.Width 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      150        0       22     0.99    1.199   0.8676      0.2      0.2 
##      .25      .50      .75      .90      .95 
##      0.3      1.3      1.8      2.2      2.3 
## 
## lowest : 0.1 0.2 0.3 0.4 0.5, highest: 2.1 2.2 2.3 2.4 2.5
## --------------------------------------------------------------------------------
## Species 
##        n  missing distinct 
##      150        0        3 
##                                            
## Value          setosa versicolor  virginica
## Frequency          50         50         50
## Proportion      0.333      0.333      0.333
## --------------------------------------------------------------------------------
# summary() of columns 2 to 5 (size, speed, mxPH, mnO2), computed separately
# for each level of season
by(algae[,2:5], algae$season, summary)
## algae$season: autumn
##      size       speed         mxPH            mnO2      
##  large :11   high  :15   Min.   :5.700   Min.   : 6.50  
##  medium:16   low   : 8   1st Qu.:7.588   1st Qu.:10.22  
##  small :13   medium:17   Median :8.060   Median :10.90  
##                          Mean   :7.952   Mean   :10.60  
##                          3rd Qu.:8.400   3rd Qu.:11.43  
##                          Max.   :8.870   Max.   :12.90  
## ------------------------------------------------------------ 
## algae$season: spring
##      size       speed         mxPH            mnO2       
##  large :12   high  :21   Min.   :5.600   Min.   : 1.800  
##  medium:21   low   : 8   1st Qu.:7.790   1st Qu.: 6.000  
##  small :20   medium:24   Median :8.070   Median : 8.900  
##                          Mean   :8.024   Mean   : 8.010  
##                          3rd Qu.:8.400   3rd Qu.: 9.875  
##                          Max.   :9.500   Max.   :12.500  
##                                          NA's   :1       
## ------------------------------------------------------------ 
## algae$season: summer
##      size       speed         mxPH            mnO2       
##  large :10   high  :20   Min.   :6.400   Min.   : 4.400  
##  medium:21   low   : 7   1st Qu.:7.600   1st Qu.: 8.125  
##  small :14   medium:18   Median :8.000   Median :10.100  
##                          Mean   :7.905   Mean   : 9.415  
##                          3rd Qu.:8.200   3rd Qu.:10.875  
##                          Max.   :8.800   Max.   :12.100  
##                                          NA's   :1       
## ------------------------------------------------------------ 
## algae$season: winter
##      size       speed         mxPH            mnO2       
##  large :12   high  :28   Min.   :6.600   Min.   : 1.500  
##  medium:26   low   :10   1st Qu.:7.800   1st Qu.: 7.625  
##  small :24   medium:24   Median :8.100   Median : 9.500  
##                          Mean   :8.119   Mean   : 8.880  
##                          3rd Qu.:8.430   3rd Qu.:10.650  
##                          Max.   :9.700   Max.   :13.400  
##                          NA's   :1

R Markdown

Including Plots

You can also embed plots, for example:

Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.