# Create a numeric vector; `=` and `<-` both assign at top level.
x = c(1,2,3,4,5)
print(x)
# [1] 1 2 3 4 5
x <- c(1,2,3,4,5)
print(x)
# [1] 1 2 3 4 5
Reference:
R from http://cran.r-project.org/
RStudio from http://rstudio.com
Resources:
* An Introduction to Statistical Learning
# Create a numeric vector; `=` and `<-` both assign at top level.
x = c(1,2,3,4,5)
print(x)
# [1] 1 2 3 4 5
x <- c(1,2,3,4,5)
print(x)
# [1] 1 2 3 4 5

# Element-wise addition of two vectors of equal length.
x = c(1,2,3,4,5)
y = c(1,2,3,4,5)
z = x+y # 2 4 6 8 10
print(z)
# [1] 2 4 6 8 10
z = z+2 # 4 6 8 10 12
print(z)
# [1] 4 6 8 10 12

# Vectors whose lengths are not multiples of each other.
x = c(1,2,3,4,5)
k = c(2,3)
# NOTE(review): in R, x+k with lengths 5 and 2 recycles k and raises a
# warning rather than an error -- confirm the wording of the message below.
print("x+k is Error")
# [1] "x+k is Error"

# A length-1 vector is recycled across every element of x.
x = c(1,2,3,4,5)
k = c(2)
z = x+k
print(z)
# [1] 3 4 5 6 7
# length() reports the number of elements in a vector.
x <- c(1,2,3,4,5)
length(x)
# [1] 5
y = c(1,2,3,4,5)
length(y)
# [1] 5
k = c(2)
length(k)
# [1] 1

# ls() lists the objects in the workspace; rm() removes them.
print(ls())
# [1] "k" "x" "y" "z"
rm(k, z)
print(ls())
# [1] "x" "y"
# Remove everything that is left in the workspace.
rm(list=ls())
print(ls())
# character(0)
# One-line function definition: returns x^2 + y^2 (element-wise for vectors).
f = function(x, y) x^2 + y^2
print(f(10, 10))
# [1] 200
# Multi-line function definition: the value of the last expression (z) is
# what the function returns.
f = function(x, y){ # Shift+Enter
  z = x^2 + y^2
  z
}
print(f(10, 10))
# [1] 200
# Open the help page for matrix().
?matrix

# Build a 2x2 matrix with named arguments; values fill column by column.
x = matrix(data=c(1, 2, 3, 4), nrow=2, ncol=2)
print(x)
#      [,1] [,2]
# [1,]    1    3
# [2,]    2    4

# The same call with positional arguments (data, nrow, ncol).
x = matrix(c(1, 2, 3, 4), 2, 2)
print(x)
#      [,1] [,2]
# [1,]    1    3
# [2,]    2    4

# Named arguments may be given in any order.
x = matrix(nrow=2, ncol=2, data=c(1, 2, 3, 4))
print(x)
#      [,1] [,2]
# [1,]    1    3
# [2,]    2    4

# dim() returns the number of rows and columns.
x = matrix(data=c(1, 2, 3, 4), nrow=2, ncol=2)
dim(x)
# [1] 2 2
# Outer product with the %o% operator: z[i, j] = x[i] * y[j].
x = 1:5
y = x
z = x %o% y # outer product of vectors
print(z)
#      [,1] [,2] [,3] [,4] [,5]
# [1,]    1    2    3    4    5
# [2,]    2    4    6    8   10
# [3,]    3    6    9   12   15
# [4,]    4    8   12   16   20
# [5,]    5   10   15   20   25

# outer() with its default function gives the same result as %o%.
x = 1:5
y = x
z = outer(x,y)
print(z)
#      [,1] [,2] [,3] [,4] [,5]
# [1,]    1    2    3    4    5
# [2,]    2    4    6    8   10
# [3,]    3    6    9   12   15
# [4,]    4    8   12   16   20
# [5,]    5   10   15   20   25

# outer() with "+" applied to every (x[i], y[j]) pair.
x = 1:5
y = x
z = outer(x, y, "+")
print(z)
#      [,1] [,2] [,3] [,4] [,5]
# [1,]    2    3    4    5    6
# [2,]    3    4    5    6    7
# [3,]    4    5    6    7    8
# [4,]    5    6    7    8    9
# [5,]    6    7    8    9   10

# outer() with a user-defined function of two arguments.
x = 1:5
y = x
f = function(x, y) x^2 + y^2
z = outer(x, y, f)
print(z)
#      [,1] [,2] [,3] [,4] [,5]
# [1,]    2    5   10   17   26
# [2,]    5    8   13   20   29
# [3,]   10   13   18   25   34
# [4,]   17   20   25   32   41
# [5,]   26   29   34   41   50
# Square root of a number.
x <- 4
x = sqrt(x)

# Square of a number.
x <- 2
x = x^2
print(x)
# [1] 4
# Draw 5 standard-normal values. The printed numbers come from one
# particular run; without a seed every run gives different values.
x = rnorm(5)
print(x)
# [1]  0.1607087 -0.3911897  0.6331766  1.1846232 -0.5664601
y = rnorm(5, mean=50, sd=5)
print(y)
# [1] 50.44063 57.69443 50.75348 51.55136 47.94516

# Correlation between two independent samples.
x = rnorm(5)
y = rnorm(5, mean=50, sd=5)
cor(x, y)
# [1] 0.2691936

# Two unseeded draws differ from each other.
x = rnorm(5);
print(x)
# [1] -0.07537298 -1.31504100  0.84138315  0.12311147  0.77717263
x = rnorm(5);
print(x)
# [1] 0.92607212 1.54560344 1.55898802 0.93918978 0.05363251

# Setting the same seed before each draw reproduces the same values.
set.seed(5); x = rnorm(5); x
# [1] -0.84085548  1.38435934 -1.25549186  0.07014277  1.71144087
set.seed(5); x = rnorm(5); x
# [1] -0.84085548  1.38435934 -1.25549186  0.07014277  1.71144087

# Mean, variance and standard deviation of the seeded sample.
mean(x); var(x); sd(x)
# [1] 0.2139191
# [1] 1.726223
# [1] 1.313858
# rnorm() normal variables with mean 0 and std 1
x = rnorm(100)
y = rnorm(100)
plot(x,y)
plot(x, y, xlim=c(0,2), ylim=c(0,2))
# "l" for lines and "b" for both points and lines
plot(x, y, type="p")
# "l" for lines and "b" for both points and lines
plot(x, y, type="b")
# xlabel and ylabel
plot(x,y, main="Scatter Plot", xlab="x-axis", ylab="y-axis")

# Write the plot to a PDF file instead of the screen device.
pdf(file = "xy.pdf")
plot(x,y, main="xy plot", xlab="x-axis", ylab="y-axis")
dev.off() # complete plotting
# quartz_off_screen
#                 2

# Write the plot to a JPEG file.
jpeg(file = "xy.jpg")
plot(x,y, main="xy plot", xlab="x-axis", ylab="y-axis")
dev.off() # complete plotting
# quartz_off_screen
#                 2
# Integer sequence from 1 to 10 with seq().
x = seq(1, 10)
x
#  [1]  1  2  3  4  5  6  7  8  9 10

# The colon operator gives the same sequence.
x = 1:10
x
#  [1]  1  2  3  4  5  6  7  8  9 10

# Five equally spaced values between 1 and 10. The argument is spelled out
# as length.out rather than relying on partial matching of `length`.
x = seq(1, 10, length.out=5)
x
# [1]  1.00  3.25  5.50  7.75 10.00
x <- seq(-pi, pi, length.out = 5)
x
# [1] -3.141593 -1.570796  0.000000  1.570796  3.141593
First dimension: a vector of the x values
Second dimension: a vector of the y values
Third dimension: a matrix of the z values whose elements correspond to each pair of (x, y) coordinates
# Evaluate f on the full grid of (x, y) pairs, then draw the surface.
x = 1:10
y = x
f = function(x, y) cos(y) / (1 + x^2)
z = outer(x, y, f)
persp(x, y, z)
# theta and phi set the viewing direction of the perspective plot.
persp(x, y, z, theta=30)
persp(x, y, z, theta=30, phi=70)
persp(x, y, z, theta=30, phi=40)
image(x, y, z)

# Matrix indexing: a 4x4 matrix filled column by column with 1..16.
A = matrix(1:16, 4, 4)
A
#      [,1] [,2] [,3] [,4]
# [1,]    1    5    9   13
# [2,]    2    6   10   14
# [3,]    3    7   11   15
# [4,]    4    8   12   16

# Single element: row 4, column 4.
A[4,4]
# [1] 16

# Sub-matrix selected by row and column index vectors.
A[c(1,2,3), c(1,2,3)]
#      [,1] [,2] [,3]
# [1,]    1    5    9
# [2,]    2    6   10
# [3,]    3    7   11
A[1:3, 1:3]
#      [,1] [,2] [,3]
# [1,]    1    5    9
# [2,]    2    6   10
# [3,]    3    7   11

# An empty index selects the whole row or column.
A[4,]
# [1]  4  8 12 16
A[,4]
# [1] 13 14 15 16
A[1:3,]
#      [,1] [,2] [,3] [,4]
# [1,]    1    5    9   13
# [2,]    2    6   10   14
# [3,]    3    7   11   15
A[,1:3]
#      [,1] [,2] [,3]
# [1,]    1    5    9
# [2,]    2    6   10
# [3,]    3    7   11
# [4,]    4    8   12

# Negative indices drop the given rows.
A[-1,]
#      [,1] [,2] [,3] [,4]
# [1,]    2    6   10   14
# [2,]    3    7   11   15
# [3,]    4    8   12   16
A[-c(1,2),]
#      [,1] [,2] [,3] [,4]
# [1,]    3    7   11   15
# [2,]    4    8   12   16
A[-c(1,4),]
#      [,1] [,2] [,3] [,4]
# [1,]    2    6   10   14
# [2,]    3    7   11   15
# Load the Auto data set shipped with the ISLR2 package.
#install.packages("ISLR2")
library(ISLR2)
Auto = ISLR2::Auto
#Auto = read.csv("data/auto.csv")
head(Auto)
#   mpg cylinders displacement horsepower weight acceleration year origin
# 1  18         8          307        130   3504         12.0   70      1
# 2  15         8          350        165   3693         11.5   70      1
# 3  18         8          318        150   3436         11.0   70      1
# 4  16         8          304        150   3433         12.0   70      1
# 5  17         8          302        140   3449         10.5   70      1
# 6  15         8          429        198   4341         10.0   70      1
#                        name
# 1 chevrolet chevelle malibu
# 2         buick skylark 320
# 3        plymouth satellite
# 4             amc rebel sst
# 5               ford torino
# 6          ford galaxie 500
dim(Auto)
# [1] 392   9

# First four rows, all columns.
Auto[1:4, ]
#   mpg cylinders displacement horsepower weight acceleration year origin
# 1  18         8          307        130   3504         12.0   70      1
# 2  15         8          350        165   3693         11.5   70      1
# 3  18         8          318        150   3436         11.0   70      1
# 4  16         8          304        150   3433         12.0   70      1
#                        name
# 1 chevrolet chevelle malibu
# 2         buick skylark 320
# 3        plymouth satellite
# 4             amc rebel sst

# Drop rows containing missing values; the dimensions printed below are
# the same as before the call.
Auto <- na.omit(Auto)
dim(Auto)
# [1] 392   9
names(Auto)
# [1] "mpg"          "cylinders"    "displacement" "horsepower"   "weight"
# [6] "acceleration" "year"         "origin"       "name"
# names() can also be assigned to, giving each vector element a label.
x = 1:3
names(x) = c("a","b","c")
x
# a b c
# 1 2 3
RStudio → Session → Set Working Directory → To Source File Location
getwd()
setwd("/Users/data")
na.strings
stringsAsFactors = T
# Read the Auto data from CSV. "?" entries are treated as missing values and
# character columns are converted to factors. TRUE is spelled out because
# `T` is an ordinary variable that can be reassigned.
Auto = read.csv("data/auto.csv", header = TRUE, na.strings = c("?"), stringsAsFactors = TRUE)
names(Auto)
# [1] "mpg"          "cylinders"    "displacement" "horsepower"   "weight"
# [6] "acceleration" "year"         "origin"       "name"
Auto[1:4, ]
#   mpg cylinders displacement horsepower weight acceleration year origin
# 1  18         8          307        130   3504         12.0   70      1
# 2  15         8          350        165   3693         11.5   70      1
# 3  18         8          318        150   3436         11.0   70      1
# 4  16         8          304        150   3433         12.0   70      1
#                        name
# 1 chevrolet chevelle malibu
# 2         buick skylark 320
# 3        plymouth satellite
# 4             amc rebel sst

# Drop the rows that contain missing values.
Auto <- na.omit(Auto)
dim(Auto)
# [1] 392   9
library(ISLR2)
Auto = ISLR2::Auto
names(Auto)
# [1] "mpg"          "cylinders"    "displacement" "horsepower"   "weight"
# [6] "acceleration" "year"         "origin"       "name"

# Scatter plot, referring to the columns through the data frame.
plot(Auto$cylinders , Auto$mpg)

# The same plot with the columns made visible by attach(); detach() with no
# argument removes the most recently attached object again.
attach(Auto)
plot(cylinders , mpg)
detach()

# With cylinders as a factor, plot() draws box plots per level.
Auto$cylinders <- as.factor(Auto$cylinders)
plot(Auto$cylinders , Auto$mpg)
plot(Auto$cylinders , Auto$mpg , col="red")
plot(Auto$cylinders, Auto$mpg , col="red", varwidth=TRUE)
plot(Auto$cylinders, Auto$mpg , col="red", varwidth=TRUE, horizontal=TRUE)
plot(Auto$cylinders, Auto$mpg, col="red", varwidth=TRUE, xlab="cylinders", ylab="MPG")

# Histograms.
hist(Auto$mpg)
hist(Auto$mpg , col=2) # red
hist(Auto$mpg , col=2, breaks=15)

# Scatter-plot matrices: all variables, then a chosen subset.
pairs(Auto)
pairs(~ mpg + displacement + horsepower + weight + acceleration, data=Auto)

# Label some points with a variable
names(Auto) # Field names
# [1] "mpg"          "cylinders"    "displacement" "horsepower"   "weight"
# [6] "acceleration" "year"         "origin"       "name"
plot(Auto$horsepower , Auto$mpg)
identify(Auto$horsepower, Auto$mpg, Auto$name) #integer(0)

# Numerical summary of every column, then of a single column.
summary(Auto)
#       mpg        cylinders  displacement     horsepower        weight
#  Min.   : 9.00   3:  4     Min.   : 68.0   Min.   : 46.0   Min.   :1613
#  1st Qu.:17.00   4:199     1st Qu.:105.0   1st Qu.: 75.0   1st Qu.:2225
#  Median :22.75   5:  3     Median :151.0   Median : 93.5   Median :2804
#  Mean   :23.45   6: 83     Mean   :194.4   Mean   :104.5   Mean   :2978
#  3rd Qu.:29.00   8:103     3rd Qu.:275.8   3rd Qu.:126.0   3rd Qu.:3615
#  Max.   :46.60             Max.   :455.0   Max.   :230.0   Max.   :5140
#   acceleration        year           origin                      name
#  Min.   : 8.00   Min.   :70.00   Min.   :1.000   amc matador       :  5
#  1st Qu.:13.78   1st Qu.:73.00   1st Qu.:1.000   ford pinto        :  5
#  Median :15.50   Median :76.00   Median :1.000   toyota corolla    :  5
#  Mean   :15.54   Mean   :75.98   Mean   :1.577   amc gremlin       :  4
#  3rd Qu.:17.02   3rd Qu.:79.00   3rd Qu.:2.000   amc hornet        :  4
#  Max.   :24.80   Max.   :82.00   Max.   :3.000   chevrolet chevette:  4
#                                                  (Other)           :365
summary(Auto$mpg)
#    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
#    9.00   17.00   22.75   23.45   29.00   46.60
# Core ggplot2 calls used in the examples that follow: ggplot(df) and aes()
library(ggplot2)
dim(diamonds)
# [1] 53940    10
head(diamonds)
# # A tibble: 6 × 10
#   carat cut       color clarity depth table price     x     y     z
#   <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
# 1  0.23 Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43
# 2  0.21 Premium   E     SI1      59.8    61   326  3.89  3.84  2.31
# 3  0.23 Good      E     VS1      56.9    65   327  4.05  4.07  2.31
# 4  0.29 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63
# 5  0.31 Good      J     SI2      63.3    58   335  4.34  4.35  2.75
# 6  0.24 Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48
#View(diamonds)
names(diamonds)
#  [1] "carat"   "cut"     "color"   "clarity" "depth"   "table"   "price"
#  [8] "x"       "y"       "z"
# "carat" "cut" "color" "clarity" "depth" "table" "price" "x" "y" "z" # if only the dataset is known
ggplot(diamonds)
# if only X-axis is known. The Y-axis can be specified in respective geoms
ggplot(diamonds, aes(x=carat))
# if both X and Y axes are fixed for all layers
ggplot(diamonds,
       aes(x=carat, y=price))
# Each category of the 'cut' variable will now have a distinct color, once a geom is added
ggplot(diamonds,
       aes(x=carat, color=cut))
# Fix color and will not vary based on dataframe variable
# NOTE(review): color= is passed to ggplot() itself here, not to a geom;
# presumably it only takes effect when given to a geom -- confirm.
ggplot(diamonds,
       aes(x=carat),
       color="steelblue")
# Adding scatterplot geom (layer1) and smoothing geom (layer2)
ggplot(diamonds,
       aes(x=carat, y=price,
           color=cut)) +
  geom_point() +
  geom_smooth()
# `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
# Same as above but specifying the aesthetics inside the geoms
ggplot(diamonds) +
  geom_point(aes(x=carat, y=price,
                 color=cut)) +
  geom_smooth(aes(x=carat, y=price,
                  color=cut))
# `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
# Remove color from geom_smooth for one smoothing line
ggplot(diamonds) +
  geom_point(aes(x=carat, y=price,
                 color=cut)) +
  geom_smooth(aes(x=carat, y=price))
# `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
# same but simpler
ggplot(diamonds, aes(x=carat,
                     y=price)) +
  geom_point(aes(color=cut)) +
  geom_smooth()
# `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
# shape of the points vary with color feature?
ggplot(diamonds,
       aes(x=carat, y=price,
           color=cut, shape=color)) +
  geom_point()
# Warning: Using shapes for an ordinal variable is not advised
# Warning: The shape palette can deal with a maximum of 6 discrete values because
# more than 6 becomes difficult to discriminate; you have 7. Consider
# specifying shapes manually if you must have them.
# Warning: Removed 2808 rows containing missing values (`geom_point()`).
# add axis labels and plot title.
gg <- ggplot(diamonds, aes(x=carat, y=price, color=cut)) +
  geom_point() +
  labs(title="Scatterplot", x="Carat", y="Price")
print(gg)
# add title and axis text, change legend title.
gg1 = gg + theme(
  plot.title = element_text(size=30,
                            face="bold"),
  axis.text.x = element_text(size=15),
  axis.text.y = element_text(size=15),
  axis.title.x = element_text(size=25),
  axis.title.y = element_text(size=25)) +
  scale_color_discrete(name="Cut of diamonds")
print(gg1) # print the plot
# Related legend-title helpers (kept as notes, not executed):
#   scale_shape_discrete(name="legend title")
#   scale_shape_continuous(name="legend title")
# row ~ column
# columns defined by 'cut'
gg1 + facet_wrap( ~ cut, ncol=3)
# row: color, column: cut
gg1 + facet_wrap(color ~ cut)
# In a grid
gg1 + facet_grid(color ~ cut)
# Bar Charts
# Y axis derived from counts of X item
plot1 = ggplot(mtcars, aes(x=cyl)) +
  geom_bar() +
  labs(title="Frequency bar chart")
print(plot1)
# Y axis is explicit in the dataframe 'stat=identity'
df <- data.frame(var=c("a", "b", "c"), nums=c(1:3))
#View(df)
plot2 <- ggplot(df, aes(x=var,
                        y=nums)) +
  geom_bar(stat = "identity")
print(plot2)
# Flipping coordinates
df <- data.frame(var=c("a", "b", "c"), nums=c(1:3))
ggplot(df, aes(x=var, y=nums)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title="Coordinates are flipped")
# Adjust X and Y axis limits
# coord_cartesian(xlim=c(x1,x2))
# xlim(c(x1,x2)) # delete datapoints
# scale_x_continuous(limits=c(x1,x2)) # delete datapoints
# Coord_cartesian zoomed in
ggplot(diamonds, aes(x=carat, y=price)) +
  geom_point(aes(color=cut)) +
  geom_smooth() +
  coord_cartesian(ylim=c(0, 10000)) +
  labs(title="Coord_cartesian zoomed in!")
# `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
# Datapoints deleted: change in smoothing lines
ggplot(diamonds, aes(x=carat, y=price)) +
  geom_point(aes(color=cut)) +
  geom_smooth() +
  ylim(c(0, 10000)) +
  labs(title="Datapoints deleted: Note the change in smoothing lines!")
# `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
# Warning: Removed 5222 rows containing non-finite values (`stat_smooth()`).
# Warning: Removed 5222 rows containing missing values (`geom_point()`).
# print squares of numbers
#' Print the squares of 1, 2, ..., n, one per line.
#'
#' @param n Number of squares to print; 0 prints nothing.
#' @return NULL, invisibly (the value of the `for` loop).
squares = function(n) {
  # seq_len(n) is empty for n = 0, whereas 1:n would iterate over c(1, 0)
  for(i in seq_len(n)) {
    print(i^2)
  }
}
squares(5)
# [1] 1
# [1] 4
# [1] 9
# [1] 16
# [1] 25
# add squares of numbers
#' Sum of the squares of two numbers.
#'
#' @param a,b Numeric values (vectors are handled element-wise).
#' @return a^2 + b^2.
squares = function(a, b) {
  a^2 + b^2
}
result = squares(a=5, b=5)
print(result)
# [1] 50
# Lazy Evaluation of Function
#' Print both arguments, one after the other.
#'
#' Arguments are evaluated only when first used, so calling this with `b`
#' missing still prints `a` before the error for `b` is raised.
#'
#' @param a First value to print.
#' @param b Second value to print.
#' @return The value of `print(b)`, i.e. `b`, invisibly.
lazyfn <- function(a, b) {
  # `a` is evaluated and printed first
  print(a)
  # `b` is evaluated only here: error only when it is needed
  print(b)
}
#lazyfn(5)
# Check if Odd or Even
x = 6
if(x %% 2 == 0){
  print("x is even")
}else{
  print("x is odd")
}
# [1] "x is even"

# Classify a number as negative, positive or zero with an else-if chain.
x = 0
if (x < 0) {
  print("Negative number")
} else if (x > 0) {
  print("Positive number")
} else{
  print("Zero")
}
# [1] "Zero"
# Loop over an integer range.
for (x in 1:5) {
  print(x)
}
# [1] 1
# [1] 2
# [1] 3
# [1] 4
# [1] 5

# Loop over the elements of a list.
days <- list("monday", "tuesday", "wednesday", "thursday", "friday")
for (x in days) {
  print(x)
}
# [1] "monday"
# [1] "tuesday"
# [1] "wednesday"
# [1] "thursday"
# [1] "friday"

# Loop over the elements of a numeric vector.
dice <- c(1, 2, 3, 4, 5, 6)
for (x in dice) {
  print(x)
}
# [1] 1
# [1] 2
# [1] 3
# [1] 4
# [1] 5
# [1] 6
library(ISLR2)
Boston = ISLR2::Boston
dim(Boston)
# [1] 506  13
#View(Boston)
# NOTE(review): attach() is kept only in case code outside this section
# relies on it; every call below names its columns through Boston$.
attach(Boston)
unique(Boston$rad)
# [1]  1  2  3  5  4  8  6  7 24
table_rad = table(Boston$rad) # frequency of unique values
df = as.data.frame(table_rad)
print(df)
#   Var1 Freq
# 1    1   20
# 2    2   24
# 3    3   38
# 4    4  110
# 5    5  115
# 6    6   26
# 7    7   17
# 8    8   24
# 9   24  132

# One basic chart of each type.
pie(x=df$Freq, labels=df$Var1, main="Pie")
barplot(df$Freq, names.arg=df$Var1, main="Barplot")
# The column is passed explicitly: the default boxplot method has no `data`
# argument, so the original call only found `age` through attach().
boxplot(Boston$age, main="Box Plot")
hist(Boston$age, main="Histogram")
plot(Boston$medv, type="l", main="Line")
plot(Boston$medv, type="o", main="Line and Points")
plot(x=Boston$lstat, y=Boston$medv, main="Scatter")