This tutorial provides basic instruction in applied statistics for beginners using the R programming language. While this material is not intended to be a comprehensive reference for students, it aims to give them an overview of statistics and of R's basic functionality. It covers topics ranging from the very basics of statistics to advanced statistical learning models.

Introduction to the Basics of Applied Statistical Modelling

  1. Introduction to the Instructor and Course

I am Tuyen, a master's student in the Environmental Management Program at Massey University, New Zealand. I am about to complete my master's degree this December, and will then work at the Faculty of Resource Management, Thai Nguyen University of Agriculture and Forestry.

My research interests are the application of GIS and remote sensing in natural protected areas, and statistical learning algorithms for land use/land cover detection.

I am interested in sports, including swimming, volleyball and badminton.

That’s it about me :) and any questions?

– What is a population of interest, and how do we define it?

– What are a sample and inference? Why do we want a representative sample?

– How do we collect a sample (sampling techniques)?

Question: The government announced that 52% of people voted to leave the EU. What do you think of this statement from a statistical point of view?

== Statistical studies: observational and experimental studies

Introduction to R programming

** Getting hands-on with R

# Numeric data: an integer vector holding the values 1 to 100

a <- 1:100

length(a) # Number of elements in a
## [1] 100
typeof(a) # Storage type of a
## [1] "integer"
a[68] # Element at position 68
## [1] 68
# Character data: mixing numbers and strings coerces every element to character

x <- c(1, "4", "Ha")

typeof(x)
## [1] "character"
length(x)
## [1] 3
# Sequence of numbers starting at 1, in steps of 2.5, not going beyond 10

a <- seq(from = 1, to = 10, by = 2.5)

a
## [1] 1.0 3.5 6.0 8.5
typeof(a)
## [1] "double"
length(a)
## [1] 4
typeof(as.integer(a)) # Storage type after converting to integer
## [1] "integer"
# Matrix with 2 rows and 3 columns, filled column by column (the default)

m <- matrix(seq_len(6), nrow = 2L, ncol = 3L)

typeof(m)
## [1] "integer"

Data used in this tutorial

# List the datasets available in the built-in "datasets" package
data(package="datasets")

An example of dataset available in R

# Keep a working copy of the built-in ChickWeight dataset under the name df
df <- ChickWeight

# Show the first six rows
head(df, n = 6L)

Some useful functions in R

# Compactly display the structure of df: classes, size and column types
str(df)
## Classes 'nfnGroupedData', 'nfGroupedData', 'groupedData' and 'data.frame':   578 obs. of  4 variables:
##  $ weight: num  42 51 59 64 76 93 106 125 149 171 ...
##  $ Time  : num  0 2 4 6 8 10 12 14 16 18 ...
##  $ Chick : Ord.factor w/ 50 levels "18"<"16"<"15"<..: 15 15 15 15 15 15 15 15 15 15 ...
##  $ Diet  : Factor w/ 4 levels "1","2","3","4": 1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, "formula")=Class 'formula'  language weight ~ Time | Chick
##   .. ..- attr(*, ".Environment")=<environment: R_EmptyEnv> 
##  - attr(*, "outer")=Class 'formula'  language ~Diet
##   .. ..- attr(*, ".Environment")=<environment: R_EmptyEnv> 
##  - attr(*, "labels")=List of 2
##   ..$ x: chr "Time"
##   ..$ y: chr "Body weight"
##  - attr(*, "units")=List of 2
##   ..$ x: chr "(days)"
##   ..$ y: chr "(gm)"
# Number of rows and number of columns of df
dim(df)
## [1] 578   4
# Draw the default plot for the whole dataset
plot(df)

An example

# Read the Diamonds CSV straight from GitHub; header = TRUE takes the column
# names from the first line of the file. TRUE is spelled out because T is an
# ordinary variable that can be reassigned.
mydf <- read.csv(
  "https://raw.githubusercontent.com/tuyenhavan/Statistics/Dataset/Diamonds.csv",
  header = TRUE
)

head(mydf)
# Set working directory first

library(readxl)
## Warning: package 'readxl' was built under R version 3.4.2
# This is to read an Excel dataset (kept commented out; it expects a file
# named newdata.xls in the working directory):
# df_exel <- read_excel("newdata.xls")

Indexing and subsetting

library(MASS)

# First six rows of the iris dataset
head(iris, n = 6L)
# Column-by-column summary of the whole dataset
summary(iris)
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
## 
# Summary of a single column
summary(iris[["Sepal.Length"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   4.300   5.100   5.800   5.843   6.400   7.900
# Keep all rows but only the first three columns, selected here by name
myiris1 <- iris[, c("Sepal.Length", "Sepal.Width", "Petal.Length")]

head(myiris1, n = 6L)

-Another way of subsetting

# Exclude the Species column: flag it by name, then keep the unflagged columns

var <- names(iris) == "Species"

df_irs <- iris[, !var]

head(df_irs, n = 6L)
# Another way: negative indices drop columns by position
# (here the first two columns are removed; Species is kept)

df_iris1 <- iris[, -(1:2)]

head(df_iris1, n = 6L)

How to subset all columns for the rows where Species is setosa or versicolor

# Keep every column, but only the rows whose species is setosa or versicolor
new_iris <- iris[iris$Species %in% c("setosa", "versicolor"), ]

dim(new_iris)
## [1] 100   5
head(new_iris, n = 6L)
tail(new_iris, n = 6L)

More about dataset

library(MASS)

head(airquality)
# Dealing with missing data: drop every row that contains at least one NA

myair <- na.omit(airquality)

dim(myair)
## [1] 111   6
head(myair)
# Another way: keep only the rows that complete.cases() flags as complete
df_unmissing <- airquality[complete.cases(airquality), ]

dim(df_unmissing)
## [1] 111   6
# Inspect the rows that have at least one missing value

missing <- airquality[!complete.cases(airquality), ]

head(missing)
# Replace missing Solar.R values by the mean of the observed values.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.

df1 <- airquality

df1$Solar.R[is.na(df1$Solar.R)] <- mean(df1$Solar.R, na.rm = TRUE)

head(df1)
# The same replacement written with ifelse()
df2 <- airquality

df2.fix <- ifelse(
  is.na(df2$Solar.R),
  mean(df2$Solar.R, na.rm = TRUE),
  df2$Solar.R
)

df2$Solar.R <- df2.fix

head(df2)
# Visualize the pattern of missing data

library(mice)
## Warning: package 'mice' was built under R version 3.4.2
## Loading required package: lattice
library(VIM)
## Warning: package 'VIM' was built under R version 3.4.2
## Loading required package: colorspace
## Loading required package: grid
## Loading required package: data.table
## VIM is ready to use. 
##  Since version 4.0.0 the GUI is in its own package VIMGUI.
## 
##           Please use the package to use the new (and old) GUI.
## Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
# aggr() (from VIM) plots the missing values per variable and their patterns.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
pl <- aggr(
  airquality,
  col = c(2, 3, 4, 5),
  numbers = TRUE,
  sortVars = TRUE,
  labels = names(airquality),
  cex.axis = 0.8,
  gap = 3,
  ylab = c("Missing data", "Pattern")
)

## 
##  Variables sorted by number of missings: 
##  Variable      Count
##     Ozone 0.24183007
##   Solar.R 0.04575163
##      Wind 0.00000000
##      Temp 0.00000000
##     Month 0.00000000
##       Day 0.00000000
library(mice)

# Tabulate the missing-data patterns across the columns of airquality
md.pattern(airquality)
##     Wind Temp Month Day Solar.R Ozone   
## 111    1    1     1   1       1     1  0
##  35    1    1     1   1       1     0  1
##   5    1    1     1   1       0     1  1
##   2    1    1     1   1       0     0  2
##        0    0     0   0       7    37 44
# Impute with predictive mean matching (method = "pmm"): m = 5 imputed
# datasets, 50 iterations each (maxit = 50); the seed makes the run repeatable

my_im <- mice(
  airquality,
  m = 5,
  maxit = 50,
  method = "pmm",
  seed = 12
)
## 
##  iter imp variable
##   1   1  Ozone  Solar.R
##   1   2  Ozone  Solar.R
##   1   3  Ozone  Solar.R
##   1   4  Ozone  Solar.R
##   1   5  Ozone  Solar.R
##   2   1  Ozone  Solar.R
##   2   2  Ozone  Solar.R
##   2   3  Ozone  Solar.R
##   2   4  Ozone  Solar.R
##   2   5  Ozone  Solar.R
##   3   1  Ozone  Solar.R
##   3   2  Ozone  Solar.R
##   3   3  Ozone  Solar.R
##   3   4  Ozone  Solar.R
##   3   5  Ozone  Solar.R
##   4   1  Ozone  Solar.R
##   4   2  Ozone  Solar.R
##   4   3  Ozone  Solar.R
##   4   4  Ozone  Solar.R
##   4   5  Ozone  Solar.R
##   5   1  Ozone  Solar.R
##   5   2  Ozone  Solar.R
##   5   3  Ozone  Solar.R
##   5   4  Ozone  Solar.R
##   5   5  Ozone  Solar.R
##   6   1  Ozone  Solar.R
##   6   2  Ozone  Solar.R
##   6   3  Ozone  Solar.R
##   6   4  Ozone  Solar.R
##   6   5  Ozone  Solar.R
##   7   1  Ozone  Solar.R
##   7   2  Ozone  Solar.R
##   7   3  Ozone  Solar.R
##   7   4  Ozone  Solar.R
##   7   5  Ozone  Solar.R
##   8   1  Ozone  Solar.R
##   8   2  Ozone  Solar.R
##   8   3  Ozone  Solar.R
##   8   4  Ozone  Solar.R
##   8   5  Ozone  Solar.R
##   9   1  Ozone  Solar.R
##   9   2  Ozone  Solar.R
##   9   3  Ozone  Solar.R
##   9   4  Ozone  Solar.R
##   9   5  Ozone  Solar.R
##   10   1  Ozone  Solar.R
##   10   2  Ozone  Solar.R
##   10   3  Ozone  Solar.R
##   10   4  Ozone  Solar.R
##   10   5  Ozone  Solar.R
##   11   1  Ozone  Solar.R
##   11   2  Ozone  Solar.R
##   11   3  Ozone  Solar.R
##   11   4  Ozone  Solar.R
##   11   5  Ozone  Solar.R
##   12   1  Ozone  Solar.R
##   12   2  Ozone  Solar.R
##   12   3  Ozone  Solar.R
##   12   4  Ozone  Solar.R
##   12   5  Ozone  Solar.R
##   13   1  Ozone  Solar.R
##   13   2  Ozone  Solar.R
##   13   3  Ozone  Solar.R
##   13   4  Ozone  Solar.R
##   13   5  Ozone  Solar.R
##   14   1  Ozone  Solar.R
##   14   2  Ozone  Solar.R
##   14   3  Ozone  Solar.R
##   14   4  Ozone  Solar.R
##   14   5  Ozone  Solar.R
##   15   1  Ozone  Solar.R
##   15   2  Ozone  Solar.R
##   15   3  Ozone  Solar.R
##   15   4  Ozone  Solar.R
##   15   5  Ozone  Solar.R
##   16   1  Ozone  Solar.R
##   16   2  Ozone  Solar.R
##   16   3  Ozone  Solar.R
##   16   4  Ozone  Solar.R
##   16   5  Ozone  Solar.R
##   17   1  Ozone  Solar.R
##   17   2  Ozone  Solar.R
##   17   3  Ozone  Solar.R
##   17   4  Ozone  Solar.R
##   17   5  Ozone  Solar.R
##   18   1  Ozone  Solar.R
##   18   2  Ozone  Solar.R
##   18   3  Ozone  Solar.R
##   18   4  Ozone  Solar.R
##   18   5  Ozone  Solar.R
##   19   1  Ozone  Solar.R
##   19   2  Ozone  Solar.R
##   19   3  Ozone  Solar.R
##   19   4  Ozone  Solar.R
##   19   5  Ozone  Solar.R
##   20   1  Ozone  Solar.R
##   20   2  Ozone  Solar.R
##   20   3  Ozone  Solar.R
##   20   4  Ozone  Solar.R
##   20   5  Ozone  Solar.R
##   21   1  Ozone  Solar.R
##   21   2  Ozone  Solar.R
##   21   3  Ozone  Solar.R
##   21   4  Ozone  Solar.R
##   21   5  Ozone  Solar.R
##   22   1  Ozone  Solar.R
##   22   2  Ozone  Solar.R
##   22   3  Ozone  Solar.R
##   22   4  Ozone  Solar.R
##   22   5  Ozone  Solar.R
##   23   1  Ozone  Solar.R
##   23   2  Ozone  Solar.R
##   23   3  Ozone  Solar.R
##   23   4  Ozone  Solar.R
##   23   5  Ozone  Solar.R
##   24   1  Ozone  Solar.R
##   24   2  Ozone  Solar.R
##   24   3  Ozone  Solar.R
##   24   4  Ozone  Solar.R
##   24   5  Ozone  Solar.R
##   25   1  Ozone  Solar.R
##   25   2  Ozone  Solar.R
##   25   3  Ozone  Solar.R
##   25   4  Ozone  Solar.R
##   25   5  Ozone  Solar.R
##   26   1  Ozone  Solar.R
##   26   2  Ozone  Solar.R
##   26   3  Ozone  Solar.R
##   26   4  Ozone  Solar.R
##   26   5  Ozone  Solar.R
##   27   1  Ozone  Solar.R
##   27   2  Ozone  Solar.R
##   27   3  Ozone  Solar.R
##   27   4  Ozone  Solar.R
##   27   5  Ozone  Solar.R
##   28   1  Ozone  Solar.R
##   28   2  Ozone  Solar.R
##   28   3  Ozone  Solar.R
##   28   4  Ozone  Solar.R
##   28   5  Ozone  Solar.R
##   29   1  Ozone  Solar.R
##   29   2  Ozone  Solar.R
##   29   3  Ozone  Solar.R
##   29   4  Ozone  Solar.R
##   29   5  Ozone  Solar.R
##   30   1  Ozone  Solar.R
##   30   2  Ozone  Solar.R
##   30   3  Ozone  Solar.R
##   30   4  Ozone  Solar.R
##   30   5  Ozone  Solar.R
##   31   1  Ozone  Solar.R
##   31   2  Ozone  Solar.R
##   31   3  Ozone  Solar.R
##   31   4  Ozone  Solar.R
##   31   5  Ozone  Solar.R
##   32   1  Ozone  Solar.R
##   32   2  Ozone  Solar.R
##   32   3  Ozone  Solar.R
##   32   4  Ozone  Solar.R
##   32   5  Ozone  Solar.R
##   33   1  Ozone  Solar.R
##   33   2  Ozone  Solar.R
##   33   3  Ozone  Solar.R
##   33   4  Ozone  Solar.R
##   33   5  Ozone  Solar.R
##   34   1  Ozone  Solar.R
##   34   2  Ozone  Solar.R
##   34   3  Ozone  Solar.R
##   34   4  Ozone  Solar.R
##   34   5  Ozone  Solar.R
##   35   1  Ozone  Solar.R
##   35   2  Ozone  Solar.R
##   35   3  Ozone  Solar.R
##   35   4  Ozone  Solar.R
##   35   5  Ozone  Solar.R
##   36   1  Ozone  Solar.R
##   36   2  Ozone  Solar.R
##   36   3  Ozone  Solar.R
##   36   4  Ozone  Solar.R
##   36   5  Ozone  Solar.R
##   37   1  Ozone  Solar.R
##   37   2  Ozone  Solar.R
##   37   3  Ozone  Solar.R
##   37   4  Ozone  Solar.R
##   37   5  Ozone  Solar.R
##   38   1  Ozone  Solar.R
##   38   2  Ozone  Solar.R
##   38   3  Ozone  Solar.R
##   38   4  Ozone  Solar.R
##   38   5  Ozone  Solar.R
##   39   1  Ozone  Solar.R
##   39   2  Ozone  Solar.R
##   39   3  Ozone  Solar.R
##   39   4  Ozone  Solar.R
##   39   5  Ozone  Solar.R
##   40   1  Ozone  Solar.R
##   40   2  Ozone  Solar.R
##   40   3  Ozone  Solar.R
##   40   4  Ozone  Solar.R
##   40   5  Ozone  Solar.R
##   41   1  Ozone  Solar.R
##   41   2  Ozone  Solar.R
##   41   3  Ozone  Solar.R
##   41   4  Ozone  Solar.R
##   41   5  Ozone  Solar.R
##   42   1  Ozone  Solar.R
##   42   2  Ozone  Solar.R
##   42   3  Ozone  Solar.R
##   42   4  Ozone  Solar.R
##   42   5  Ozone  Solar.R
##   43   1  Ozone  Solar.R
##   43   2  Ozone  Solar.R
##   43   3  Ozone  Solar.R
##   43   4  Ozone  Solar.R
##   43   5  Ozone  Solar.R
##   44   1  Ozone  Solar.R
##   44   2  Ozone  Solar.R
##   44   3  Ozone  Solar.R
##   44   4  Ozone  Solar.R
##   44   5  Ozone  Solar.R
##   45   1  Ozone  Solar.R
##   45   2  Ozone  Solar.R
##   45   3  Ozone  Solar.R
##   45   4  Ozone  Solar.R
##   45   5  Ozone  Solar.R
##   46   1  Ozone  Solar.R
##   46   2  Ozone  Solar.R
##   46   3  Ozone  Solar.R
##   46   4  Ozone  Solar.R
##   46   5  Ozone  Solar.R
##   47   1  Ozone  Solar.R
##   47   2  Ozone  Solar.R
##   47   3  Ozone  Solar.R
##   47   4  Ozone  Solar.R
##   47   5  Ozone  Solar.R
##   48   1  Ozone  Solar.R
##   48   2  Ozone  Solar.R
##   48   3  Ozone  Solar.R
##   48   4  Ozone  Solar.R
##   48   5  Ozone  Solar.R
##   49   1  Ozone  Solar.R
##   49   2  Ozone  Solar.R
##   49   3  Ozone  Solar.R
##   49   4  Ozone  Solar.R
##   49   5  Ozone  Solar.R
##   50   1  Ozone  Solar.R
##   50   2  Ozone  Solar.R
##   50   3  Ozone  Solar.R
##   50   4  Ozone  Solar.R
##   50   5  Ozone  Solar.R
# Extract the first of the imputed (completed) datasets from my_im

complete1 <- complete(my_im, action = 1)

head(complete1, n = 6L)