R Notebook

This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.

Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Ctrl+Shift+Enter.

plot(cars)

Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Ctrl+Alt+I.

When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Ctrl+Shift+K to preview the HTML file).

A friendly Introduction to R

df <- data.frame(a = c("Ha", "Van", "Tuyen", "Ha", "Van"), num = 1:5)

# Keep the rows whose `a` is "Ha" or "Tuyen". The column being tested goes on
# the left of %in% so the logical mask has exactly one element per row.
ha <- df[df$a %in% c("Ha", "Tuyen"), ]

ha

##       a num
## 1    Ha   1
## 3 Tuyen   3
## 4    Ha   4

Create random numbers from a uniform distribution

rd<-runif(100)

hist(rd,col=rainbow(length(rd)))

Matrix operations

# matrix multiplication

# 1:6 and 4:9 are each recycled twice to fill the 12 cells of a 3 x 4 matrix
a <- matrix(1:6, nrow = 3, ncol = 4, byrow = TRUE)

b <- matrix(4:9, nrow = 3, ncol = 4, byrow = FALSE)

mt <- a %*% t(b) # matrix a times the transpose of b: (3 x 4) %*% (4 x 3) = 3 x 3

rownames(mt) <- paste("row_", seq_len(nrow(mt))) # assign row names

colnames(mt) <- paste("col_", seq_len(ncol(mt))) # assign column names

mt

##        col_ 1 col_ 2 col_ 3
## row_ 1     58     68     78
## row_ 2     80     94    108
## row_ 3    102    120    138

# check files

list.files()

##  [1] "ad.html"             "ad.nb.html"          "ad.pdf"             
##  [4] "ad.Rmd"              "as.nb.html"          "as.Rmd"             
##  [7] "AS.tif"              "das.Rmd"             "das_files"          
## [10] "dsa.html"            "dsa.nb.html"         "dsa.Rmd"            
## [13] "dsad.html"           "dsad.nb.html"        "dsad.Rmd"           
## [16] "ew.html"             "ew.Rmd"              "Joint_dplyr.html"   
## [19] "Joint_dplyr.nb.html" "Joint_dplyr.Rmd"     "my_iris.csv"        
## [22] "QGIS Saving.osm"     "Rmark.nb.html"       "Rmark.Rmd"          
## [25] "rsconnect"           "tu1.html"            "tu1.pdf"            
## [28] "tu1.Rmd"             "tuyemap.html"        "tuyemap.nb.html"    
## [31] "tuyemap.Rmd"         "tuyen1_files"        "tuyen3_files"       
## [34] "tuyen5.Rmd"          "tuyen6.Rmd"          "tuyen6_files"       
## [37] "tuyen7.nb.html"      "tuyen7.Rmd"          "Yen Data"

# "\\." matches a literal dot, so only names that end in ".csv" are listed
list.files(pattern = "\\.csv$", full.names = TRUE)

## [1] "./my_iris.csv"

Set values equal to 4 to NA

head(mtcars)

##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1

mt <- mtcars
# Set every `gear` value equal to 4 to NA

i <- mt$gear == 4

mt$gear[i] <- NA

# another way, in one step. %in% is used instead of == because `gear` already
# holds NA at this point and `NA == 4` is NA, not FALSE; %in% never returns NA.

mt$gear[mt$gear %in% 4] <- NA

head(mt)

##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1   NA    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1   NA    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1   NA    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1

# get a unique set of values 

library(caret)

## Warning: package 'caret' was built under R version 3.2.5

## Loading required package: lattice

## Warning: package 'lattice' was built under R version 3.2.5

## Loading required package: ggplot2

## Warning: package 'ggplot2' was built under R version 3.2.5

unique(iris$Species) # provide a unique label of species in the dataset

## [1] setosa     versicolor virginica 
## Levels: setosa versicolor virginica

table(iris$Species) # calculate the number of each species

## 
##     setosa versicolor  virginica 
##         50         50         50

# change name of levels

# Convert the factor to plain character values and overwrite the matching
# entries in a single step; replace() leaves every other value untouched.
iris$Species <- replace(
  as.character(iris$Species),
  iris$Species == "setosa",
  "Setosa"
)

head(iris)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  Setosa
## 2          4.9         3.0          1.4         0.2  Setosa
## 3          4.7         3.2          1.3         0.2  Setosa
## 4          4.6         3.1          1.5         0.2  Setosa
## 5          5.0         3.6          1.4         0.2  Setosa
## 6          5.4         3.9          1.7         0.4  Setosa

# Change a certain character in a word

# sub() with the "^" anchor changes only a "v" at the start of the string;
# gsub("v", "V", ...) would also change any later "v" inside the word.
iris$Species <- sub("^v", "V", iris$Species)

tail(iris)

##     Sepal.Length Sepal.Width Petal.Length Petal.Width   Species
## 145          6.7         3.3          5.7         2.5 Virginica
## 146          6.7         3.0          5.2         2.3 Virginica
## 147          6.3         2.5          5.0         1.9 Virginica
## 148          6.5         3.0          5.2         2.0 Virginica
## 149          6.2         3.4          5.4         2.3 Virginica
## 150          5.9         3.0          5.1         1.8 Virginica

Using functions

# First function

# Build a greeting for `name`. paste() joins its arguments with single
# spaces, so the result has the form "Hello <name> !".
f <- function(name) {
  greeting <- paste("Hello", name, "!")
  greeting
}

f("Tuyen")

## [1] "Hello Tuyen !"

# Calculate the area of a rectangle

# Returns the area of an x-by-y rectangle as a labelled string,
# e.g. f1(20, 30) gives "600 square meters".
f1 <- function(x, y) {
  area <- x * y

  paste(area, "square meters")
}

f1(20,30)

## [1] "600 square meters"

# Calculate area of circle

# Returns the area of a circle with radius `r`, rounded to 2 decimal places.
f2 <- function(r) {
  # the area must depend on the argument r, not on a fixed radius
  area <- pi * r^2

  round(area, 2)
}

f2(8)

## [1] 201.06

The apply family of functions

# create a 5 x 5 matrix, filled row by row with 1:10 repeated

# rep_len() spells out the recycling: 1:10 does not divide evenly into the
# 25 cells, so the fill no longer depends on implicit partial recycling
my_matrix <- matrix(rep_len(1:10, 25), nrow = 5, ncol = 5, byrow = TRUE)

# apply(): margin `1` means by row (calculating the mean of each row of the matrix)

apply(my_matrix,1,mean)

## [1] 3 8 3 8 3

# another way to mean by row

rowMeans(my_matrix) # other useful functions like sum, std, min,max,range,median, and quantile

## [1] 3 8 3 8 3

# `2` indicates for column

apply(my_matrix,2,mean)

## [1] 3 4 5 6 7

colMeans(my_matrix)

## [1] 3 4 5 6 7

tapply can be used to calculate statistics for each group. It needs one categorical (grouping) variable.

library(caret)

tapply(iris$Sepal.Length,iris$Species,mean,na.rm=T)

##     Setosa Versicolor  Virginica 
##      5.006      5.936      6.588

tapply(iris$Sepal.Width,iris$Species,mean,na.rm=T)

##     Setosa Versicolor  Virginica 
##      3.428      2.770      2.974

The aggregate function is similar to tapply, but calculates statistics for several variables at once

# first way

aggregate(iris[,-5],iris[,5,drop=F],mean)

##      Species Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1     Setosa        5.006       3.428        1.462       0.246
## 2 Versicolor        5.936       2.770        4.260       1.326
## 3  Virginica        6.588       2.974        5.552       2.026

# another way

aggregate(iris[,c(1:4)],list(iris$Species),mean,na.rm=T)

##      Group.1 Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1     Setosa        5.006       3.428        1.462       0.246
## 2 Versicolor        5.936       2.770        4.260       1.326
## 3  Virginica        6.588       2.974        5.552       2.026

lapply and sapply functions. While lapply always returns a list, sapply returns a vector or matrix when the results can be simplified.
for loop. A for loop iterates a predefined number of times.

# A `for` loop always evaluates to NULL, so its result is not assigned.
# seq_len(10) is the safe way to write the sequence 1, 2, ..., 10.
for (i in seq_len(10)) {
  print("Hello my friend")
}

## [1] "Hello my friend"
## [1] "Hello my friend"
## [1] "Hello my friend"
## [1] "Hello my friend"
## [1] "Hello my friend"
## [1] "Hello my friend"
## [1] "Hello my friend"
## [1] "Hello my friend"
## [1] "Hello my friend"
## [1] "Hello my friend"

# break and next

for (i in 1:10) {
  # `break` leaves the loop entirely once i goes past 9
  if (i > 9) {
    break
  }
  # `next` skips the rest of the body for these values
  if (i %in% c(1, 2, 3, 7)) {
    next
  }
  print(i)
}

## [1] 4
## [1] 5
## [1] 6
## [1] 8
## [1] 9

Basic plot

# Scatter plot of speed against stopping distance, drawn without axes so they
# can be added one at a time below.
plot(speed ~ dist, data = cars, axes = FALSE,
     xlab = "Distance", ylab = "Speed", main = "Scatter Plot")

axis(1) # x axis (bottom)

axis(2) # y axis (left)

text(20, 25, "Cars") # label placed at x = 20, y = 25

# logistic regression

cars$above30 <- cars$dist > 30 # create TRUE and FALSE values

head(cars)

##   speed dist above30
## 1     4    2   FALSE
## 2     4   10   FALSE
## 3     7    4   FALSE
## 4     7   22   FALSE
## 5     8   16   FALSE
## 6     9   10   FALSE

glm1 <- glm(above30 ~ speed, data = cars, family = "binomial")

# Use predict() to get the fitted probability for speeds 1 to 30

p <- predict(glm1, newdata = data.frame(speed = 1:30), type = "response")


# Colour each point by its outcome (2 for FALSE, 3 for TRUE). A bare c(2, 3)
# is recycled by row position, so the colours would not follow the class.
plot(above30 ~ speed, data = cars, main = "Logistic Regression Plot",
     col = ifelse(cars$above30, 3, 2))

lines(1:30, p, col = 4)

# Enlarge the plotting area by shrinking all four margins

par(mar = rep(2, 4))
# Seed the generator before the first random draw so that x, y and value
# are all reproducible from run to run
set.seed(123)

# create some random numbers
x <- rnorm(10)

y <- rnorm(10)

value <- runif(length(x)) * 5

# point size and colour both follow `value`
plot(y ~ x, cex = value, pch = 16, col = as.numeric(value))

text(x, y, LETTERS[1:10], pos = 3)

longitude <- c(-116.7, -120.4, -116.7, -113.5, -115.5, -120.8, -119.5, -113.7, -113.7, -110.7)
latitude <- c(45.3, 42.6, 38.9, 42.1, 35.7, 38.9, 36.2, 39, 41.6, 36.9)

value<-runif(10)

lonlat <- cbind(longitude, latitude)

class(lonlat)

## [1] "matrix"

# We already created a matrix 'lonlat'

library(sp)

## Warning: package 'sp' was built under R version 3.2.5

# we can create a SpatialPoints object directly from a coordinate matrix

pt<- SpatialPoints(lonlat)

class(pt)

## [1] "SpatialPoints"
## attr(,"package")
## [1] "sp"

# Access their attributes

showDefault(pt)

## An object of class "SpatialPoints"
## Slot "coords":
##       longitude latitude
##  [1,]    -116.7     45.3
##  [2,]    -120.4     42.6
##  [3,]    -116.7     38.9
##  [4,]    -113.5     42.1
##  [5,]    -115.5     35.7
##  [6,]    -120.8     38.9
##  [7,]    -119.5     36.2
##  [8,]    -113.7     39.0
##  [9,]    -113.7     41.6
## [10,]    -110.7     36.9
## 
## Slot "bbox":
##              min    max
## longitude -120.8 -110.7
## latitude    35.7   45.3
## 
## Slot "proj4string":
## CRS arguments: NA

# if the longitude/latitude columns are in a data.frame, we can use `coordinates(df) <- ~x+y`

# assign a coordinate system to SpatialPoints

pt <- SpatialPoints(lonlat, proj4string=CRS('+proj=longlat +datum=WGS84'))

pt

## SpatialPoints:
##       longitude latitude
##  [1,]    -116.7     45.3
##  [2,]    -120.4     42.6
##  [3,]    -116.7     38.9
##  [4,]    -113.5     42.1
##  [5,]    -115.5     35.7
##  [6,]    -120.8     38.9
##  [7,]    -119.5     36.2
##  [8,]    -113.7     39.0
##  [9,]    -113.7     41.6
## [10,]    -110.7     36.9
## Coordinate Reference System (CRS) arguments: +proj=longlat
## +datum=WGS84 +ellps=WGS84 +towgs84=0,0,0

# Create a spatial point data frame

# seq_len() gives one Id per coordinate row and stays correct for zero rows
df <- data.frame(Id = seq_len(nrow(lonlat)), value = value)

my_pt <- SpatialPointsDataFrame(pt, data = df)

my_pt

##       coordinates Id      value
## 1  (-116.7, 45.3)  1 0.95683335
## 2  (-120.4, 42.6)  2 0.45333416
## 3  (-116.7, 38.9)  3 0.67757064
## 4  (-113.5, 42.1)  4 0.57263340
## 5  (-115.5, 35.7)  5 0.10292468
## 6  (-120.8, 38.9)  6 0.89982497
## 7  (-119.5, 36.2)  7 0.24608773
## 8    (-113.7, 39)  8 0.04205953
## 9  (-113.7, 41.6)  9 0.32792072
## 10 (-110.7, 36.9) 10 0.95450365

# we can also produce lines (`spLines`) and polygons (`spPolygons`) with the raster package

library(raster)

## Warning: package 'raster' was built under R version 3.2.5

# spLines() is raster's constructor for a SpatialLines object; stats::spline()
# would only return interpolated x/y values, not a spatial object
sl <- spLines(lonlat, crs = "+proj=longlat +datum=WGS84")

sp <- spPolygons(lonlat, crs = "+proj=longlat +datum=WGS84")

plot(sp, axes = TRUE)

# draw the same polygon again on top, filled (col) and outlined (border)
plot(sp, col = 3, border = 4, add = TRUE)

# If a polygon shapefile has no coordinate system, we can assign one
library(sp)

# example:   crs(my_shapefile) <- CRS("+proj=longlat +datum=WGS84")

# remove it

#   crs(my_shapefile)<-NA

# Another way to build an sp object: start from x,y columns in a data.frame

# ten random x and y values
x<-runif(10)

y<-runif(10)

df<-data.frame(x,y)


# the formula names the columns that hold the coordinates; after this
# assignment `df` is a spatial object rather than a plain data.frame
coordinates(df)<-~x+y

# attach the coordinate reference system (longitude/latitude, WGS84 datum)
proj4string(df)<-CRS("+proj=longlat +datum=WGS84")

df

## class       : SpatialPoints 
## features    : 10 
## extent      : 0.1471136, 0.9942698, 0.02461368, 0.9630242  (xmin, xmax, ymin, ymax)
## coord. ref. : +proj=longlat +datum=WGS84 +ellps=WGS84 +towgs84=0,0,0