types of variables
# integers
x = 2L
typeof(x)
## [1] "integer"
#------------
x2 = 2
typeof(x2)
## [1] "double"
# double
y = 2.5
typeof(y)
## [1] "double"
# complex
z = 3 + 2i
typeof(z)
## [1] "complex"
# character
a = "h"
typeof(a)
## [1] "character"
# logical
q1 = T
typeof(q1)
## [1] "logical"
#---------
q2 = F
typeof(q2)
## [1] "logical"
Using Variables
A = 10
B = 5
C = A + B
print(C)
## [1] 15
# variable 1
var1 = 2.5
# variable 2
var2 = 4
result = var1/var2
result
## [1] 0.625
answer = sqrt(var2)
answer
## [1] 2
#character
greeting = "Hello"
name = "World"
message = paste(greeting, name)
message
## [1] "Hello World"
Logical Variables and Operators
# logical:
# TRUE T
# FALSE F
4 < 5
## [1] TRUE
10 > 100
## [1] FALSE
4 == 5
## [1] FALSE
#---------------
# ==
#!=
#>
#<
#>=
#<=
#!
#&
# isTRUE(x)
result = 4 < 5
result
## [1] TRUE
typeof(result)
## [1] "logical"
resultt = !TRUE
resultt
## [1] FALSE
#
result2 = !(5 > 1)
result2
## [1] FALSE
# at least one of them is T (use |)
result | result2
## [1] TRUE
# both of them are T (use &)
result & result2
## [1] FALSE
isTRUE(result)
## [1] TRUE
isTRUE(result2)
## [1] FALSE
The while loop
# A while loop checks its logical condition before every pass, so a
# condition that is FALSE from the start means the body never runs.
while (FALSE) {
  print("Hello")
}
#-------------------------------------------
# Changing FALSE to TRUE above would print "Hello" forever.
# Count upwards from 1 and stop as soon as the counter reaches 12.
counter <- 1
repeat {
  if (counter >= 12) {
    break
  }
  print(counter)
  counter <- counter + 1
}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
## [1] 11
Using the console
x = 5
print(x)
## [1] 5
# type in console directly
using “for” loop
# a vector example
for(i in 1:5){
print("Hello R")
}
## [1] "Hello R"
## [1] "Hello R"
## [1] "Hello R"
## [1] "Hello R"
## [1] "Hello R"
# a vector example
for(i in 5:10){
print("Hello R")
}
## [1] "Hello R"
## [1] "Hello R"
## [1] "Hello R"
## [1] "Hello R"
## [1] "Hello R"
## [1] "Hello R"
if statements
#---1--- -2----- -1 ----- 0 ---- 2 -----
# Draw one random number and label it only when it exceeds 1.
# When the draw is <= 1, `answer` keeps whatever value it already had.
x <- rnorm(1) # generate one random number
if (x > 1) {
  answer <- "Greater than one"
}
# Draw again and classify the value into three ranges with a nested if/else.
x <- rnorm(1) # generate one random number
if (x > 1) {
  answer <- "Greater than one"
} else {
  # Fixed: the lower bound must be -1. The original test `x >= 1` can only be
  # TRUE in this branch when x is exactly 1, so nearly every value <= 1 was
  # labelled "less than -1".
  if (x >= -1) {
    answer <- "Between -1 and 1"
  } else {
    answer <- "less than -1"
  }
}
print(answer)
## [1] "less than -1"
# Same three-way classification written as a flat if / else if / else chain.
x <- rnorm(1) # generate one random number
if (x > 1) {
  answer <- "Greater than one"
} else if (x >= -1) {
  answer <- "Between -1 and 1"
} else {
  # Fixed label: this branch is reached only when x < -1, so the message
  # must say "less than -1" (the original said "less than 1").
  answer <- "less than -1"
}
print(answer)
## [1] "Greater than one"
Law of large numbers (LLN): $\bar{X}_n \to E(X)$ as $n \to \infty$
# rnorm(N) defaults to mean = 0 and sd = 1, so most values fall roughly between -2 and 2
# Estimate P(-1 < X < 1) for X ~ Normal(mean = 10, sd = 5) by simulation:
# the share of N random draws that land strictly between -1 and 1.
N <- 1000000
draws <- rnorm(N, mean = 10, sd = 5)
# Vectorized count: the comparison yields one logical per draw and sum()
# counts the TRUEs, replacing a million-iteration interpreted loop.
counter <- sum(draws > -1 & draws < 1)
counter / N
## [1] 0.022288
# Estimate P(-0.65 < Z < 0.65) for a standard normal Z by simulation:
# the share of N random draws that land strictly between -0.65 and 0.65.
N <- 100000
draws <- rnorm(N)
# Vectorized count: one logical per draw, summed to get the number of hits,
# instead of looping over every draw with a scalar if().
counter <- sum(draws > -0.65 & draws < 0.65)
answer <- counter / N
answer
## [1] 0.48316
create some vectors
MyFirstVector = c(3, 45, 56, 732)
print(MyFirstVector)
## [1] 3 45 56 732
is.numeric(MyFirstVector)
## [1] TRUE
is.integer(MyFirstVector)
## [1] FALSE
is.double(MyFirstVector)
## [1] TRUE
V2 = c(3L, 12L, 243L, 0L)
print(V2)
## [1] 3 12 243 0
is.numeric(V2)
## [1] TRUE
is.integer(V2)
## [1] TRUE
is.double(V2)
## [1] FALSE
V3 = c("a","B23","Hello")
print(V3)
## [1] "a" "B23" "Hello"
is.numeric(V3)
## [1] FALSE
is.integer(V3)
## [1] FALSE
is.double(V3)
## [1] FALSE
V4 = c("a","B23","Hello", 6)
print(V4)
## [1] "a" "B23" "Hello" "6"
is.character(V4)
## [1] TRUE
is.numeric(V4)
## [1] FALSE
is.integer(V4)
## [1] FALSE
is.double(V4)
## [1] FALSE
#seq() #sequence
#rep() replicate
seq(1,15)
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
seq(1,15,3)
## [1] 1 4 7 10 13
z = seq(1,20, 5)
print(z)
## [1] 1 6 11 16
d = rep(3, 10)
rep("bien", 3)
## [1] "bien" "bien" "bien"
x = c(80,20)
y = rep(x, 4)
y
## [1] 80 20 80 20 80 20 80 20
Using the [] brackets
x = c(1, 123, 534, 12, 4) #combine
y = seq(200, 250, 11) #sequence
z = rep("Hi", 3) #replicate
w = c("a", "b", "c", "d", "e")
print(w)
## [1] "a" "b" "c" "d" "e"
w[1]
## [1] "a"
w[2]
## [1] "b"
w[-1]
## [1] "b" "c" "d" "e"
w[-3]
## [1] "a" "b" "d" "e"
w[1:3]
## [1] "a" "b" "c"
w[4:5]
## [1] "d" "e"
w[c(1, 3, 5)]
## [1] "a" "c" "e"
w[c(-2, -4)]
## [1] "a" "c" "e"
w[-3: -5]
## [1] "a" "b"
w[1:2]
## [1] "a" "b"
Vectorized operation
a = c(2, 3, 4,5)
b = c(23, 12, 34, 44)
a + b
## [1] 25 15 38 49
a * b
## [1] 46 36 136 220
a / b
## [1] 0.08695652 0.25000000 0.11764706 0.11363636
a - b
## [1] -21 -9 -30 -39
The power of vectorized operations
x = rnorm(5)
x
## [1] -1.1468314 0.6508734 -1.4070737 0.3350163 -0.1044050
# R specific programming loop
for(i in x){
print(i)
}
## [1] -1.146831
## [1] 0.6508734
## [1] -1.407074
## [1] 0.3350163
## [1] -0.104405
# like a monkey? use loop
print(x[1])
## [1] -1.146831
print(x[2])
## [1] 0.6508734
print(x[3])
## [1] -1.407074
print(x[4])
## [1] 0.3350163
print(x[5])
## [1] -0.104405
#loop
#conventional programming loop
for(j in 1:5){
print(x[j])}
## [1] -1.146831
## [1] 0.6508734
## [1] -1.407074
## [1] 0.3350163
## [1] -0.104405
#next part
N <- 100
a <- rnorm(N)
b <- rnorm(N)
# Vectorized approach: multiply the two double vectors in a single call
c <- a * b
# De-vectorized approach: the same products computed one element at a time.
# It is slower because R has to interpret every single multiplication
# instead of handing the whole vectors over in one call.
# Preallocate a double vector (rep(NA, N) would start out logical and be
# coerced on the first assignment) and use seq_len() so that the loop is
# skipped entirely when N is 0.
d <- numeric(N)
for (i in seq_len(N)) {
  d[i] <- a[i] * b[i]
}
Functions in R
#rnorm()
#c()
#seq()
#rep()
#print()
#is.numeric()
#is.integer()
#is.character()
#typeof()
#sqrt()
#paste()
#?
round(rnorm(5, mean = 10, sd = 3), 2)
## [1] 6.52 10.48 8.14 9.06 13.14
round(seq(from = 10, to = 20, length.out = 20), 2)
## [1] 10.00 10.53 11.05 11.58 12.11 12.63 13.16 13.68 14.21 14.74 15.26 15.79
## [13] 16.32 16.84 17.37 17.89 18.42 18.95 19.47 20.00
x = c("a", "b", "c")
round(seq(from = 10, to = 20, along.with = x), 2)
## [1] 10 15 20
rep(5:6, each = 5)
## [1] 5 5 5 5 5 6 6 6 6 6
rep(x, each = 2)
## [1] "a" "a" "b" "b" "c" "c"
rep(x, times = 2)
## [1] "a" "b" "c" "a" "b" "c"
A = round(seq(from = 10, to = 20, along.with = x), 2)
B = sqrt(A)
print(B)
## [1] 3.162278 3.872983 4.472136
Packages in R
library(ggplot2)
qplot(data = diamonds, carat, price, color = clarity, facets = .~clarity)
## Warning: `qplot()` was deprecated in ggplot2 3.4.0.
Financial Statement Analysis
revenue <- c(14574.49, 7606.46, 8611.41, 9175.41, 8058.65, 8105.44, 11496.28, 9766.09, 10305.32, 14379.96, 10713.97, 15433.50)
expenses <- c(12051.82, 5695.07, 12319.20, 12089.72, 8658.57, 840.20, 3285.73, 5821.12, 6976.93, 16618.61, 10054.37, 3803.96)
#================================================================
#Data
revenue <- c(14574.49, 7606.46, 8611.41, 9175.41, 8058.65, 8105.44, 11496.28, 9766.09, 10305.32, 14379.96, 10713.97, 15433.50)
expenses <- c(12051.82, 5695.07, 12319.20, 12089.72, 8658.57, 840.20, 3285.73, 5821.12, 6976.93, 16618.61, 10054.37, 3803.96)
#Solution
#Calculate Profit As The Differences Between Revenue And Expenses
profit <- revenue - expenses
profit
## [1] 2522.67 1911.39 -3707.79 -2914.31 -599.92 7265.24 8210.55 3944.97
## [9] 3328.39 -2238.65 659.60 11629.54
#Calculate Tax As 30% Of Profit And Round To 2 Decimal Points
tax <- round(0.30 * profit, 2)
tax
## [1] 756.80 573.42 -1112.34 -874.29 -179.98 2179.57 2463.17 1183.49
## [9] 998.52 -671.60 197.88 3488.86
#Calculate Profit Remaining After Tax Is Deducted
profit.after.tax <- profit - tax
profit.after.tax
## [1] 1765.87 1337.97 -2595.45 -2040.02 -419.94 5085.67 5747.38 2761.48
## [9] 2329.87 -1567.05 461.72 8140.68
#Calculate The Profit Margin As Profit After Tax Over Revenue
#Round To 2 Decimal Points, Then Multiply By 100 To Get %
profit.margin <- round(profit.after.tax / revenue, 2) * 100
profit.margin
## [1] 12 18 -30 -22 -5 63 50 28 23 -11 4 53
#Calculate The Mean Profit After Tax For The 12 Months
mean_pat <- mean(profit.after.tax)
mean_pat
## [1] 1750.682
#Find The Months With Above-Mean Profit After Tax
good.months <- profit.after.tax > mean_pat
good.months
## [1] TRUE FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE FALSE FALSE TRUE
#Bad Months Are The Opposite Of Good Months !
bad.months <- !good.months
bad.months
## [1] FALSE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE TRUE TRUE FALSE
#The Best Month Is Where Profit After Tax Was Equal To The Maximum
best.month <- profit.after.tax == max(profit.after.tax)
best.month
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
#The Worst Month Is Where Profit After Tax Was Equal To The Minimum
worst.month <- profit.after.tax == min(profit.after.tax)
worst.month
## [1] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
#Convert All Calculations To Units Of One Thousand Dollars
revenue.1000 <- round(revenue / 1000, 0)
expenses.1000 <- round(expenses / 1000, 0)
profit.1000 <- round(profit / 1000, 0)
profit.after.tax.1000 <- round(profit.after.tax / 1000, 0)
#Print Results
revenue.1000
## [1] 15 8 9 9 8 8 11 10 10 14 11 15
expenses.1000
## [1] 12 6 12 12 9 1 3 6 7 17 10 4
profit.1000
## [1] 3 2 -4 -3 -1 7 8 4 3 -2 1 12
profit.after.tax.1000
## [1] 2 1 -3 -2 0 5 6 3 2 -2 0 8
profit.margin
## [1] 12 18 -30 -22 -5 63 50 28 23 -11 4 53
good.months
## [1] TRUE FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE FALSE FALSE TRUE
bad.months
## [1] FALSE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE TRUE TRUE FALSE
best.month
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
worst.month
## [1] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
#BONUS:
#Preview Of What's Coming In The Next Section
M <- rbind(
revenue.1000,
expenses.1000,
profit.1000,
profit.after.tax.1000,
profit.margin,
good.months,
bad.months,
best.month,
worst.month
)
#Print The Matrix
M
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11]
## revenue.1000 15 8 9 9 8 8 11 10 10 14 11
## expenses.1000 12 6 12 12 9 1 3 6 7 17 10
## profit.1000 3 2 -4 -3 -1 7 8 4 3 -2 1
## profit.after.tax.1000 2 1 -3 -2 0 5 6 3 2 -2 0
## profit.margin 12 18 -30 -22 -5 63 50 28 23 -11 4
## good.months 1 0 0 0 0 1 1 1 1 0 0
## bad.months 0 1 1 1 1 0 0 0 0 1 1
## best.month 0 0 0 0 0 0 0 0 0 0 0
## worst.month 0 0 1 0 0 0 0 0 0 0 0
## [,12]
## revenue.1000 15
## expenses.1000 4
## profit.1000 12
## profit.after.tax.1000 8
## profit.margin 53
## good.months 1
## bad.months 0
## best.month 1
## worst.month 0
Project Brief: Basketball Trends. Data and resources come from (https://www.superdatascience.com/pages/rcourse)
# preparation
#Dear Student,
#
#Welcome to the world of Basketball Data!
#I'm sure you will enjoy this section of the R Programming course.
#
#Instructions for this dataset:
# Simply select ALL the lines in this script by pressing
# CTRL+A on Windows or CMND+A on a Mac and execute them
# Once you have executed the commands the following objects
# will be created:
# Matrices:
# - FieldGoalAttempts
# - FieldGoals
# - Games
# - MinutesPlayed
# - Salary
# Vectors:
# - Players
# - Seasons
#We will come to understand these inside the course.
#
#Sincerely,
#Kirill Eremenko
#www.superdatascience.com
#Copyright: These datasets were prepared using publicly available data.
# However, these scripts are subject to Copyright Laws.
# If you wish to use these R scripts outside of the R Programming Course
# by Kirill Eremenko, you may do so by referencing www.superdatascience.com in your work.
#Comments:
#Seasons are labeled based on the first year in the season
#E.g. the 2012-2013 season is presented as simply 2012
#Notes and Corrections to the data:
#Kevin Durant: 2006 - College Data Used
#Kevin Durant: 2005 - Proxied With 2006 Data
#Derrick Rose: 2012 - Did Not Play
#Derrick Rose: 2007 - College Data Used
#Derrick Rose: 2006 - Proxied With 2007 Data
#Derrick Rose: 2005 - Proxied With 2007 Data
#Seasons
Seasons <- c("2005","2006","2007","2008","2009","2010","2011","2012","2013","2014")
#Players
Players <- c("KobeBryant","JoeJohnson","LeBronJames","CarmeloAnthony","DwightHoward","ChrisBosh","ChrisPaul","KevinDurant","DerrickRose","DwayneWade")
#Salaries
KobeBryant_Salary <- c(15946875,17718750,19490625,21262500,23034375,24806250,25244493,27849149,30453805,23500000)
JoeJohnson_Salary <- c(12000000,12744189,13488377,14232567,14976754,16324500,18038573,19752645,21466718,23180790)
LeBronJames_Salary <- c(4621800,5828090,13041250,14410581,15779912,14500000,16022500,17545000,19067500,20644400)
CarmeloAnthony_Salary <- c(3713640,4694041,13041250,14410581,15779912,17149243,18518574,19450000,22407474,22458000)
DwightHoward_Salary <- c(4493160,4806720,6061274,13758000,15202590,16647180,18091770,19536360,20513178,21436271)
ChrisBosh_Salary <- c(3348000,4235220,12455000,14410581,15779912,14500000,16022500,17545000,19067500,20644400)
ChrisPaul_Salary <- c(3144240,3380160,3615960,4574189,13520500,14940153,16359805,17779458,18668431,20068563)
KevinDurant_Salary <- c(0,0,4171200,4484040,4796880,6053663,15506632,16669630,17832627,18995624)
DerrickRose_Salary <- c(0,0,0,4822800,5184480,5546160,6993708,16402500,17632688,18862875)
DwayneWade_Salary <- c(3031920,3841443,13041250,14410581,15779912,14200000,15691000,17182000,18673000,15000000)
#Matrix
Salary <- rbind(KobeBryant_Salary, JoeJohnson_Salary, LeBronJames_Salary, CarmeloAnthony_Salary, DwightHoward_Salary, ChrisBosh_Salary, ChrisPaul_Salary, KevinDurant_Salary, DerrickRose_Salary, DwayneWade_Salary)
rm(KobeBryant_Salary, JoeJohnson_Salary, CarmeloAnthony_Salary, DwightHoward_Salary, ChrisBosh_Salary, LeBronJames_Salary, ChrisPaul_Salary, DerrickRose_Salary, DwayneWade_Salary, KevinDurant_Salary)
colnames(Salary) <- Seasons
rownames(Salary) <- Players
#Games
KobeBryant_G <- c(80,77,82,82,73,82,58,78,6,35)
JoeJohnson_G <- c(82,57,82,79,76,72,60,72,79,80)
LeBronJames_G <- c(79,78,75,81,76,79,62,76,77,69)
CarmeloAnthony_G <- c(80,65,77,66,69,77,55,67,77,40)
DwightHoward_G <- c(82,82,82,79,82,78,54,76,71,41)
ChrisBosh_G <- c(70,69,67,77,70,77,57,74,79,44)
ChrisPaul_G <- c(78,64,80,78,45,80,60,70,62,82)
KevinDurant_G <- c(35,35,80,74,82,78,66,81,81,27)
DerrickRose_G <- c(40,40,40,81,78,81,39,0,10,51)
DwayneWade_G <- c(75,51,51,79,77,76,49,69,54,62)
#Matrix
Games <- rbind(KobeBryant_G, JoeJohnson_G, LeBronJames_G, CarmeloAnthony_G, DwightHoward_G, ChrisBosh_G, ChrisPaul_G, KevinDurant_G, DerrickRose_G, DwayneWade_G)
rm(KobeBryant_G, JoeJohnson_G, CarmeloAnthony_G, DwightHoward_G, ChrisBosh_G, LeBronJames_G, ChrisPaul_G, DerrickRose_G, DwayneWade_G, KevinDurant_G)
colnames(Games) <- Seasons
rownames(Games) <- Players
#Minutes Played
KobeBryant_MP <- c(3277,3140,3192,2960,2835,2779,2232,3013,177,1207)
JoeJohnson_MP <- c(3340,2359,3343,3124,2886,2554,2127,2642,2575,2791)
LeBronJames_MP <- c(3361,3190,3027,3054,2966,3063,2326,2877,2902,2493)
CarmeloAnthony_MP <- c(2941,2486,2806,2277,2634,2751,1876,2482,2982,1428)
DwightHoward_MP <- c(3021,3023,3088,2821,2843,2935,2070,2722,2396,1223)
ChrisBosh_MP <- c(2751,2658,2425,2928,2526,2795,2007,2454,2531,1556)
ChrisPaul_MP <- c(2808,2353,3006,3002,1712,2880,2181,2335,2171,2857)
KevinDurant_MP <- c(1255,1255,2768,2885,3239,3038,2546,3119,3122,913)
DerrickRose_MP <- c(1168,1168,1168,3000,2871,3026,1375,0,311,1530)
DwayneWade_MP <- c(2892,1931,1954,3048,2792,2823,1625,2391,1775,1971)
#Matrix
MinutesPlayed <- rbind(KobeBryant_MP, JoeJohnson_MP, LeBronJames_MP, CarmeloAnthony_MP, DwightHoward_MP, ChrisBosh_MP, ChrisPaul_MP, KevinDurant_MP, DerrickRose_MP, DwayneWade_MP)
rm(KobeBryant_MP, JoeJohnson_MP, CarmeloAnthony_MP, DwightHoward_MP, ChrisBosh_MP, LeBronJames_MP, ChrisPaul_MP, DerrickRose_MP, DwayneWade_MP, KevinDurant_MP)
colnames(MinutesPlayed) <- Seasons
rownames(MinutesPlayed) <- Players
#Field Goals
KobeBryant_FG <- c(978,813,775,800,716,740,574,738,31,266)
JoeJohnson_FG <- c(632,536,647,620,635,514,423,445,462,446)
LeBronJames_FG <- c(875,772,794,789,768,758,621,765,767,624)
CarmeloAnthony_FG <- c(756,691,728,535,688,684,441,669,743,358)
DwightHoward_FG <- c(468,526,583,560,510,619,416,470,473,251)
ChrisBosh_FG <- c(549,543,507,615,600,524,393,485,492,343)
ChrisPaul_FG <- c(407,381,630,631,314,430,425,412,406,568)
KevinDurant_FG <- c(306,306,587,661,794,711,643,731,849,238)
DerrickRose_FG <- c(208,208,208,574,672,711,302,0,58,338)
DwayneWade_FG <- c(699,472,439,854,719,692,416,569,415,509)
#Matrix
FieldGoals <- rbind(KobeBryant_FG, JoeJohnson_FG, LeBronJames_FG, CarmeloAnthony_FG, DwightHoward_FG, ChrisBosh_FG, ChrisPaul_FG, KevinDurant_FG, DerrickRose_FG, DwayneWade_FG)
rm(KobeBryant_FG, JoeJohnson_FG, LeBronJames_FG, CarmeloAnthony_FG, DwightHoward_FG, ChrisBosh_FG, ChrisPaul_FG, KevinDurant_FG, DerrickRose_FG, DwayneWade_FG)
colnames(FieldGoals) <- Seasons
rownames(FieldGoals) <- Players
#Field Goal Attempts
KobeBryant_FGA <- c(2173,1757,1690,1712,1569,1639,1336,1595,73,713)
JoeJohnson_FGA <- c(1395,1139,1497,1420,1386,1161,931,1052,1018,1025)
LeBronJames_FGA <- c(1823,1621,1642,1613,1528,1485,1169,1354,1353,1279)
CarmeloAnthony_FGA <- c(1572,1453,1481,1207,1502,1503,1025,1489,1643,806)
DwightHoward_FGA <- c(881,873,974,979,834,1044,726,813,800,423)
ChrisBosh_FGA <- c(1087,1094,1027,1263,1158,1056,807,907,953,745)
ChrisPaul_FGA <- c(947,871,1291,1255,637,928,890,856,870,1170)
KevinDurant_FGA <- c(647,647,1366,1390,1668,1538,1297,1433,1688,467)
DerrickRose_FGA <- c(436,436,436,1208,1373,1597,695,0,164,835)
DwayneWade_FGA <- c(1413,962,937,1739,1511,1384,837,1093,761,1084)
#Matrix
FieldGoalAttempts <- rbind(KobeBryant_FGA, JoeJohnson_FGA, LeBronJames_FGA, CarmeloAnthony_FGA, DwightHoward_FGA, ChrisBosh_FGA, ChrisPaul_FGA, KevinDurant_FGA, DerrickRose_FGA, DwayneWade_FGA)
rm(KobeBryant_FGA, JoeJohnson_FGA, LeBronJames_FGA, CarmeloAnthony_FGA, DwightHoward_FGA, ChrisBosh_FGA, ChrisPaul_FGA, KevinDurant_FGA, DerrickRose_FGA, DwayneWade_FGA)
colnames(FieldGoalAttempts) <- Seasons
rownames(FieldGoalAttempts) <- Players
#Points
KobeBryant_PTS <- c(2832,2430,2323,2201,1970,2078,1616,2133,83,782)
JoeJohnson_PTS <- c(1653,1426,1779,1688,1619,1312,1129,1170,1245,1154)
LeBronJames_PTS <- c(2478,2132,2250,2304,2258,2111,1683,2036,2089,1743)
CarmeloAnthony_PTS <- c(2122,1881,1978,1504,1943,1970,1245,1920,2112,966)
DwightHoward_PTS <- c(1292,1443,1695,1624,1503,1784,1113,1296,1297,646)
ChrisBosh_PTS <- c(1572,1561,1496,1746,1678,1438,1025,1232,1281,928)
ChrisPaul_PTS <- c(1258,1104,1684,1781,841,1268,1189,1186,1185,1564)
KevinDurant_PTS <- c(903,903,1624,1871,2472,2161,1850,2280,2593,686)
DerrickRose_PTS <- c(597,597,597,1361,1619,2026,852,0,159,904)
DwayneWade_PTS <- c(2040,1397,1254,2386,2045,1941,1082,1463,1028,1331)
#Matrix
Points <- rbind(KobeBryant_PTS, JoeJohnson_PTS, LeBronJames_PTS, CarmeloAnthony_PTS, DwightHoward_PTS, ChrisBosh_PTS, ChrisPaul_PTS, KevinDurant_PTS, DerrickRose_PTS, DwayneWade_PTS)
rm(KobeBryant_PTS, JoeJohnson_PTS, LeBronJames_PTS, CarmeloAnthony_PTS, DwightHoward_PTS, ChrisBosh_PTS, ChrisPaul_PTS, KevinDurant_PTS, DerrickRose_PTS, DwayneWade_PTS)
colnames(Points) <- Seasons
rownames(Points) <- Players
continue
Salary
## 2005 2006 2007 2008 2009 2010 2011
## KobeBryant 15946875 17718750 19490625 21262500 23034375 24806250 25244493
## JoeJohnson 12000000 12744189 13488377 14232567 14976754 16324500 18038573
## LeBronJames 4621800 5828090 13041250 14410581 15779912 14500000 16022500
## CarmeloAnthony 3713640 4694041 13041250 14410581 15779912 17149243 18518574
## DwightHoward 4493160 4806720 6061274 13758000 15202590 16647180 18091770
## ChrisBosh 3348000 4235220 12455000 14410581 15779912 14500000 16022500
## ChrisPaul 3144240 3380160 3615960 4574189 13520500 14940153 16359805
## KevinDurant 0 0 4171200 4484040 4796880 6053663 15506632
## DerrickRose 0 0 0 4822800 5184480 5546160 6993708
## DwayneWade 3031920 3841443 13041250 14410581 15779912 14200000 15691000
## 2012 2013 2014
## KobeBryant 27849149 30453805 23500000
## JoeJohnson 19752645 21466718 23180790
## LeBronJames 17545000 19067500 20644400
## CarmeloAnthony 19450000 22407474 22458000
## DwightHoward 19536360 20513178 21436271
## ChrisBosh 17545000 19067500 20644400
## ChrisPaul 17779458 18668431 20068563
## KevinDurant 16669630 17832627 18995624
## DerrickRose 16402500 17632688 18862875
## DwayneWade 17182000 18673000 15000000
Matrices
# A[3,4] to locate the target in matrix
Building Your First Matrix
#matrix(), cbind(), rbind()
my.data = 1: 20
A = matrix(my.data, 4, 5)
A
## [,1] [,2] [,3] [,4] [,5]
## [1,] 1 5 9 13 17
## [2,] 2 6 10 14 18
## [3,] 3 7 11 15 19
## [4,] 4 8 12 16 20
A[2,3]
## [1] 10
B = matrix(my.data, 4, 5, byrow = T)
B
## [,1] [,2] [,3] [,4] [,5]
## [1,] 1 2 3 4 5
## [2,] 6 7 8 9 10
## [3,] 11 12 13 14 15
## [4,] 16 17 18 19 20
B[2,5]
## [1] 10
#rbind
r1 = c('I', 'am', 'happy')
r2 = c('What', 'a', 'day')
r3 = c(1,2,3)
C = rbind(r1, r2, r3)
C
## [,1] [,2] [,3]
## r1 "I" "am" "happy"
## r2 "What" "a" "day"
## r3 "1" "2" "3"
c1 = 1:5
c2 = -1:-5
D = cbind(c1,c2)
D
## c1 c2
## [1,] 1 -1
## [2,] 2 -2
## [3,] 3 -3
## [4,] 4 -4
## [5,] 5 -5
Naming Dimensions
# rownames() colnames(). V['rowname','colname']?
Colnames() and Rownames()
# name vectors
Charlie = 1: 5
Charlie
## [1] 1 2 3 4 5
#give name
names(Charlie) #return NULL
## NULL
names(Charlie) = c('a','b','c','d','e')
Charlie
## a b c d e
## 1 2 3 4 5
Charlie['d']
## d
## 4
names(Charlie)
## [1] "a" "b" "c" "d" "e"
#clear names
names(Charlie) = NULL
#=========================================
#naming matrix dimensions 1
rep(c('a','b','zZ'),times = 3)
## [1] "a" "b" "zZ" "a" "b" "zZ" "a" "b" "zZ"
temp.vec = rep(c('a','b','zZ'),each = 3)
temp.vec
## [1] "a" "a" "a" "b" "b" "b" "zZ" "zZ" "zZ"
Bravo = matrix(temp.vec, 3,3)
Bravo
## [,1] [,2] [,3]
## [1,] "a" "b" "zZ"
## [2,] "a" "b" "zZ"
## [3,] "a" "b" "zZ"
rownames(Bravo)# NULL
## NULL
rownames(Bravo) = c('How','are','you')
colnames(Bravo) = c('X','Y','Z')
Bravo
## X Y Z
## How "a" "b" "zZ"
## are "a" "b" "zZ"
## you "a" "b" "zZ"
#----------------
Bravo['are','Y']
## [1] "b"
#change number
Bravo['are','Y'] = 0
Bravo
## X Y Z
## How "a" "b" "zZ"
## are "a" "0" "zZ"
## you "a" "b" "zZ"
Matrix Operations
#basketball data
Games
## 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014
## KobeBryant 80 77 82 82 73 82 58 78 6 35
## JoeJohnson 82 57 82 79 76 72 60 72 79 80
## LeBronJames 79 78 75 81 76 79 62 76 77 69
## CarmeloAnthony 80 65 77 66 69 77 55 67 77 40
## DwightHoward 82 82 82 79 82 78 54 76 71 41
## ChrisBosh 70 69 67 77 70 77 57 74 79 44
## ChrisPaul 78 64 80 78 45 80 60 70 62 82
## KevinDurant 35 35 80 74 82 78 66 81 81 27
## DerrickRose 40 40 40 81 78 81 39 0 10 51
## DwayneWade 75 51 51 79 77 76 49 69 54 62
rownames(Games)
## [1] "KobeBryant" "JoeJohnson" "LeBronJames" "CarmeloAnthony"
## [5] "DwightHoward" "ChrisBosh" "ChrisPaul" "KevinDurant"
## [9] "DerrickRose" "DwayneWade"
colnames(Games)
## [1] "2005" "2006" "2007" "2008" "2009" "2010" "2011" "2012" "2013" "2014"
Games['LeBronJames','2012']
## [1] 76
round(FieldGoals / Games, 1)
## 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014
## KobeBryant 12.2 10.6 9.5 9.8 9.8 9.0 9.9 9.5 5.2 7.6
## JoeJohnson 7.7 9.4 7.9 7.8 8.4 7.1 7.0 6.2 5.8 5.6
## LeBronJames 11.1 9.9 10.6 9.7 10.1 9.6 10.0 10.1 10.0 9.0
## CarmeloAnthony 9.4 10.6 9.5 8.1 10.0 8.9 8.0 10.0 9.6 8.9
## DwightHoward 5.7 6.4 7.1 7.1 6.2 7.9 7.7 6.2 6.7 6.1
## ChrisBosh 7.8 7.9 7.6 8.0 8.6 6.8 6.9 6.6 6.2 7.8
## ChrisPaul 5.2 6.0 7.9 8.1 7.0 5.4 7.1 5.9 6.5 6.9
## KevinDurant 8.7 8.7 7.3 8.9 9.7 9.1 9.7 9.0 10.5 8.8
## DerrickRose 5.2 5.2 5.2 7.1 8.6 8.8 7.7 NaN 5.8 6.6
## DwayneWade 9.3 9.3 8.6 10.8 9.3 9.1 8.5 8.2 7.7 8.2
round(MinutesPlayed / Games)
## 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014
## KobeBryant 41 41 39 36 39 34 38 39 30 34
## JoeJohnson 41 41 41 40 38 35 35 37 33 35
## LeBronJames 43 41 40 38 39 39 38 38 38 36
## CarmeloAnthony 37 38 36 34 38 36 34 37 39 36
## DwightHoward 37 37 38 36 35 38 38 36 34 30
## ChrisBosh 39 39 36 38 36 36 35 33 32 35
## ChrisPaul 36 37 38 38 38 36 36 33 35 35
## KevinDurant 36 36 35 39 40 39 39 39 39 34
## DerrickRose 29 29 29 37 37 37 35 NaN 31 30
## DwayneWade 39 38 38 39 36 37 33 35 33 32
Visualizing With Matplot()
t(FieldGoals) # flip table
## KobeBryant JoeJohnson LeBronJames CarmeloAnthony DwightHoward ChrisBosh
## 2005 978 632 875 756 468 549
## 2006 813 536 772 691 526 543
## 2007 775 647 794 728 583 507
## 2008 800 620 789 535 560 615
## 2009 716 635 768 688 510 600
## 2010 740 514 758 684 619 524
## 2011 574 423 621 441 416 393
## 2012 738 445 765 669 470 485
## 2013 31 462 767 743 473 492
## 2014 266 446 624 358 251 343
## ChrisPaul KevinDurant DerrickRose DwayneWade
## 2005 407 306 208 699
## 2006 381 306 208 472
## 2007 630 587 208 439
## 2008 631 661 574 854
## 2009 314 794 672 719
## 2010 430 711 711 692
## 2011 425 643 302 416
## 2012 412 731 0 569
## 2013 406 849 58 415
## 2014 568 238 338 509
matplot(t(Salary), type ='b', pch = 15: 18, col=c(1:4, 6))
legend('bottomleft', inset = 0.01, legend = Players, col = c(1:4, 6), pch = 15:18, horiz = F)
Subsetting
x = c('a','b','c','d','e')
x[c(1,5)]
## [1] "a" "e"
x[1]
## [1] "a"
#-----------
Games
## 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014
## KobeBryant 80 77 82 82 73 82 58 78 6 35
## JoeJohnson 82 57 82 79 76 72 60 72 79 80
## LeBronJames 79 78 75 81 76 79 62 76 77 69
## CarmeloAnthony 80 65 77 66 69 77 55 67 77 40
## DwightHoward 82 82 82 79 82 78 54 76 71 41
## ChrisBosh 70 69 67 77 70 77 57 74 79 44
## ChrisPaul 78 64 80 78 45 80 60 70 62 82
## KevinDurant 35 35 80 74 82 78 66 81 81 27
## DerrickRose 40 40 40 81 78 81 39 0 10 51
## DwayneWade 75 51 51 79 77 76 49 69 54 62
Games[1:3, 6:10]
## 2010 2011 2012 2013 2014
## KobeBryant 82 58 78 6 35
## JoeJohnson 72 60 72 79 80
## LeBronJames 79 62 76 77 69
# see a specific person
Games[c(1,10),]
## 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014
## KobeBryant 80 77 82 82 73 82 58 78 6 35
## DwayneWade 75 51 51 79 77 76 49 69 54 62
Games[,c('2008','2009')]
## 2008 2009
## KobeBryant 82 73
## JoeJohnson 79 76
## LeBronJames 81 76
## CarmeloAnthony 66 69
## DwightHoward 79 82
## ChrisBosh 77 70
## ChrisPaul 78 45
## KevinDurant 74 82
## DerrickRose 81 78
## DwayneWade 79 77
Games[1,] # no names
## 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014
## 80 77 82 82 73 82 58 78 6 35
is.matrix(Games[1,])
## [1] FALSE
is.vector(Games[1,])
## [1] TRUE
Games[1,5]
## [1] 73
Games[1,,drop = F] # turn vector to matrix
## 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014
## KobeBryant 80 77 82 82 73 82 58 78 6 35
Visualize subsetting
#subset to just see top x
Data = MinutesPlayed[1:3,]
matplot(t(Data), type ='b', pch = 15: 18, col=c(1:4, 6))
legend('bottomleft', inset = 0.01, legend = Players[1:3], col = c(1:4, 6), pch = 15:18, horiz = F)
#subset to just see a player
data2 = MinutesPlayed[1,,drop = F]
matplot(t(data2), type ='b', pch = 15: 18, col=c(1:4, 6))
legend('bottomleft', inset = 0.01, legend = Players[1], col = c(1:4, 6), pch = 15:18, horiz = F)
Creating Your First Function
#how to create function for the code below?
matplot(t(data2), type ='b', pch = 15: 18, col=c(1:4, 6))
legend('bottomleft', inset = 0.01, legend = Players[1], col = c(1:4, 6), pch = 15:18, horiz = F)
#---------
# Plot the selected rows of a matrix as one line per row, with a legend.
#
# Arguments:
#   data - matrix with one row per series (here: players) and one column per
#          x-axis position (here: seasons).
#   rows - row indices (or row names) to draw; defaults to 1:10.
#
# The legend is labelled from the row names of `data`, so the labels always
# belong to the rows that were drawn. When `data` has no row names, the
# global `Players` vector is used instead, as before.
# NOTE: defining `plot` in the global environment masks the usual plot()
# for calls made from the workspace.
plot <- function(data, rows = 1:10) {
  # drop = FALSE keeps a matrix even when a single row is selected
  data2 <- data[rows, , drop = FALSE]
  labels <- rownames(data2)
  if (is.null(labels)) {
    labels <- Players[rows]
  }
  # matplot() draws one line per column, hence the transpose
  matplot(t(data2), type = "b", pch = 15:18, col = c(1:4, 6))
  legend("bottomleft", inset = 0.01, legend = labels, col = c(1:4, 6), pch = 15:18, horiz = FALSE)
}
#type data and rows below
plot(MinutesPlayed/Games, 1)
Basketball Insights
#salary
plot(Salary)
plot(Salary / Games)
plot(Salary / FieldGoals)
# In-game metrics
plot(MinutesPlayed)
plot(Points)
# In-game matrices normalized
plot(FieldGoals / Games)
plot(FieldGoals / FieldGoalAttempts)
plot(FieldGoalAttempts/Games)
plot(Points/Games)
# interesting observation
plot(MinutesPlayed/Games)
plot(Games)
# time is valuable
plot(FieldGoals/MinutesPlayed)
#player style
plot(Points/FieldGoals)
Homework chapter 4
#Dear Student,
#
#Welcome to the dataset for the homework exercise.
#
#Instructions for this dataset:
# You have only been supplied vectors. You will need
# to create the matrices yourself.
# Matrices:
# - FreeThrows
# - FreeThrowAttempts
#
#Sincerely,
#Kirill Eremenko
#www.superdatascience.com
#Copyright: These datasets were prepared using publicly available data.
# However, these scripts are subject to Copyright Laws.
# If you wish to use these R scripts outside of the R Programming Course
# by Kirill Eremenko, you may do so by referencing www.superdatascience.com in your work.
#Comments:
#Seasons are labeled based on the first year in the season
#E.g. the 2012-2013 season is presented as simply 2012
#Notes and Corrections to the data:
#Kevin Durant: 2006 - College Data Used
#Kevin Durant: 2005 - Proxied With 2006 Data
#Derrick Rose: 2012 - Did Not Play
#Derrick Rose: 2007 - College Data Used
#Derrick Rose: 2006 - Proxied With 2007 Data
#Derrick Rose: 2005 - Proxied With 2007 Data
#Seasons
Seasons <- c("2005","2006","2007","2008","2009","2010","2011","2012","2013","2014")
#Players
Players <- c("KobeBryant","JoeJohnson","LeBronJames","CarmeloAnthony","DwightHoward","ChrisBosh","ChrisPaul","KevinDurant","DerrickRose","DwayneWade")
#Free Throws
KobeBryant_FT <- c(696,667,623,483,439,483,381,525,18,196)
JoeJohnson_FT <- c(261,235,316,299,220,195,158,132,159,141)
LeBronJames_FT <- c(601,489,549,594,593,503,387,403,439,375)
CarmeloAnthony_FT <- c(573,459,464,371,508,507,295,425,459,189)
DwightHoward_FT <- c(356,390,529,504,483,546,281,355,349,143)
ChrisBosh_FT <- c(474,463,472,504,470,384,229,241,223,179)
ChrisPaul_FT <- c(394,292,332,455,161,337,260,286,295,289)
KevinDurant_FT <- c(209,209,391,452,756,594,431,679,703,146)
DerrickRose_FT <- c(146,146,146,197,259,476,194,0,27,152)
DwayneWade_FT <- c(629,432,354,590,534,494,235,308,189,284)
#Matrix
#
# <put your code here>
#
#Free Throw Attempts
KobeBryant_FTA <- c(819,768,742,564,541,583,451,626,21,241)
JoeJohnson_FTA <- c(330,314,379,362,269,243,186,161,195,176)
LeBronJames_FTA <- c(814,701,771,762,773,663,502,535,585,528)
CarmeloAnthony_FTA <- c(709,568,590,468,612,605,367,512,541,237)
DwightHoward_FTA <- c(598,666,897,849,816,916,572,721,638,271)
ChrisBosh_FTA <- c(581,590,559,617,590,471,279,302,272,232)
ChrisPaul_FTA <- c(465,357,390,524,190,384,302,323,345,321)
KevinDurant_FTA <- c(256,256,448,524,840,675,501,750,805,171)
DerrickRose_FTA <- c(205,205,205,250,338,555,239,0,32,187)
DwayneWade_FTA <- c(803,535,467,771,702,652,297,425,258,370)
#Matrix
#
#
#
#Matrix for Free Throws
#Bind the given vectors to form the matrix
FreeThrows <- rbind(KobeBryant_FT, JoeJohnson_FT, LeBronJames_FT, CarmeloAnthony_FT, DwightHoward_FT, ChrisBosh_FT, ChrisPaul_FT, KevinDurant_FT, DerrickRose_FT, DwayneWade_FT)
#Remove vectors - we don't need them anymore
rm(KobeBryant_FT, JoeJohnson_FT, CarmeloAnthony_FT, DwightHoward_FT, ChrisBosh_FT, LeBronJames_FT, ChrisPaul_FT, DerrickRose_FT, DwayneWade_FT, KevinDurant_FT)
#Rename the columns
colnames(FreeThrows) = Seasons
#Rename the rows
rownames(FreeThrows) = Players
#Check the matrix
FreeThrows
## 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014
## KobeBryant 696 667 623 483 439 483 381 525 18 196
## JoeJohnson 261 235 316 299 220 195 158 132 159 141
## LeBronJames 601 489 549 594 593 503 387 403 439 375
## CarmeloAnthony 573 459 464 371 508 507 295 425 459 189
## DwightHoward 356 390 529 504 483 546 281 355 349 143
## ChrisBosh 474 463 472 504 470 384 229 241 223 179
## ChrisPaul 394 292 332 455 161 337 260 286 295 289
## KevinDurant 209 209 391 452 756 594 431 679 703 146
## DerrickRose 146 146 146 197 259 476 194 0 27 152
## DwayneWade 629 432 354 590 534 494 235 308 189 284
# Free Throw Attempts matrix: one row per player, one column per season
# Stack the per-player vectors row by row
FreeThrowAttempts <- rbind(
  KobeBryant_FTA, JoeJohnson_FTA, LeBronJames_FTA, CarmeloAnthony_FTA,
  DwightHoward_FTA, ChrisBosh_FTA, ChrisPaul_FTA, KevinDurant_FTA,
  DerrickRose_FTA, DwayneWade_FTA
)
# The individual vectors are redundant once they live in the matrix
rm(
  KobeBryant_FTA, JoeJohnson_FTA, CarmeloAnthony_FTA, DwightHoward_FTA,
  ChrisBosh_FTA, LeBronJames_FTA, ChrisPaul_FTA, DerrickRose_FTA,
  DwayneWade_FTA, KevinDurant_FTA
)
# Label the columns with the seasons
colnames(FreeThrowAttempts) <- Seasons
# Label the rows with the player names
rownames(FreeThrowAttempts) <- Players
# Print the matrix to check it
FreeThrowAttempts
## 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014
## KobeBryant 819 768 742 564 541 583 451 626 21 241
## JoeJohnson 330 314 379 362 269 243 186 161 195 176
## LeBronJames 814 701 771 762 773 663 502 535 585 528
## CarmeloAnthony 709 568 590 468 612 605 367 512 541 237
## DwightHoward 598 666 897 849 816 916 572 721 638 271
## ChrisBosh 581 590 559 617 590 471 279 302 272 232
## ChrisPaul 465 357 390 524 190 384 302 323 345 321
## KevinDurant 256 256 448 524 840 675 501 750 805 171
## DerrickRose 205 205 205 250 338 555 239 0 32 187
## DwayneWade 803 535 467 771 702 652 297 425 258 370
#Re-create the plotting function
# Draws one line per selected player across the seasons.
#   z   - matrix with one row per player and one column per season
#   who - row indices of the players to draw (defaults to all ten rows)
# The legend labels are read from the global `Players` vector, so `who`
# must index `Players` the same way it indexes the rows of `z`.
myplot <- function(z, who = 1:10) {
  # drop = FALSE keeps a single selected player as a one-row matrix, so t()
  # still yields one column per player for matplot().
  # FALSE is spelled out because F is an ordinary variable that can be reassigned.
  matplot(t(z[who, , drop = FALSE]),
          type = "b", pch = 15:18, col = c(1:4, 6),
          main = "Basketball Players Analysis")
  legend("bottomleft", inset = 0.01, legend = Players[who],
         col = c(1:4, 6), pch = 15:18, horiz = FALSE)
}
#Visualize the new matrices
myplot(FreeThrows)
myplot(FreeThrowAttempts)
#Part 1 - Free Throw Attempts Per Game
#(You will need the Games matrix)
myplot(FreeThrowAttempts/Games)
#Notice how Chris Paul gets few attempts per game
#Part 2 - Free Throw Accuracy
#Note: Derrick Rose has 0 free throws and 0 attempts in 2012, so that cell
#is 0/0 = NaN and no point is drawn for him in that season
myplot(FreeThrows/FreeThrowAttempts)
#And yet Chris Paul's accuracy is one of the highest
#Chances are his team would get more points if he had more FTA's
#Also notice that Dwight Howard's FT Accuracy is extremely poor
#compared to other players. If you recall, Dwight Howard's
#Field Goal Accuracy was exceptional:
myplot(FieldGoals/FieldGoalAttempts)
#How could this be? Why is there such a drastic difference?
#We will see just now...
#Part 3 - Player Style Patterns Excluding Free Throws
myplot((Points-FreeThrows)/FieldGoals)
#Because we have excluded free throws, this plot now shows us
#the true representation of player style change. We can verify
#that this is the case because all the marks without exception
#on this plot are between 2 and 3. That is because Field Goals
#can only be for either 2 points or 3 points.
#Insights:
#1. You can see how players' preference for 2 or 3 point shots
# changes throughout their career. We can see that almost all
# players in this dataset experiment with their style throughout
# their careers. Perhaps, the most drastic change in style has
# been experienced by Joe Johnson.
#2. There is one exception. You can see that one player has not
# changed his style at all - almost always scoring only 2-pointers.
# Who is this mystery player? It's Dwight Howard!
# Now that explains a lot. The reason that Dwight Howard's
# Field Goal accuracy is so good is because he almost always
# scores 2-pointers only. That means he can be close to the basket
# or even in contact with it. Free throws, on the other hand require
# the player to stand 15ft (4.57m) away from the hoop. That's
# probably why Dwight Howard's Free Throw Accuracy is poor.
Project Brief: Demographic Analysis
#get data from ... like before
Importing data into R
# get data
#1. select the file manually
# stats = read.csv(file.choose())
# head(stats)
#2. set WD and read data
getwd()
## [1] "/home/jupyter-yangbdm/new folder/R-program"
#point the working directory at the folder that holds the csv file
#(the path below is a Unix-style path, not a Windows one)
setwd("/home/jupyter-yangbdm/new folder/R-program")
# read the file; stringsAsFactors = TRUE turns character columns into factors
# (TRUE is spelled out because T is an ordinary variable that can be reassigned)
stats <- read.csv("P2-Demographic-Data.csv", stringsAsFactors = TRUE)
Exploring your dataset
nrow(stats)
## [1] 195
ncol(stats)
## [1] 5
head(stats, n = 5)
tail(stats, n = 5)
#--------------------
# str = structure: lists each column with its type and its first few values
# (not to be confused with runif(), which draws random uniform numbers)
str(stats)
## 'data.frame': 195 obs. of 5 variables:
## $ Country.Name : Factor w/ 195 levels "Afghanistan",..: 8 1 4 2 183 6 7 5 9 10 ...
## $ Country.Code : Factor w/ 195 levels "ABW","AFG","AGO",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Birth.rate : num 10.2 35.3 46 12.9 11 ...
## $ Internet.users: num 78.9 5.9 19.1 57.2 88 ...
## $ Income.Group : Factor w/ 4 levels "High income",..: 1 2 4 4 1 1 3 1 1 1 ...
summary(stats)
## Country.Name Country.Code Birth.rate Internet.users
## Afghanistan : 1 ABW : 1 Min. : 7.90 Min. : 0.90
## Albania : 1 AFG : 1 1st Qu.:12.12 1st Qu.:14.52
## Algeria : 1 AGO : 1 Median :19.68 Median :41.00
## Angola : 1 ALB : 1 Mean :21.47 Mean :42.08
## Antigua and Barbuda: 1 ARE : 1 3rd Qu.:29.76 3rd Qu.:66.22
## Argentina : 1 ARG : 1 Max. :49.66 Max. :96.55
## (Other) :189 (Other):189
## Income.Group
## High income :67
## Low income :30
## Lower middle income:50
## Upper middle income:48
##
##
##
Using the $ sign
head(stats)
stats[3,3]
## [1] 45.985
stats[3, "Birth.rate"]
## [1] 45.985
stats[5, "Country.Name"]
## [1] United Arab Emirates
## 195 Levels: Afghanistan Albania Algeria Angola Antigua and Barbuda ... Zimbabwe
#--------------------------
head(stats[, "Internet.users"])
## [1] 78.9 5.9 19.1 57.2 88.0 59.9
head(stats$Internet.users)
## [1] 78.9 5.9 19.1 57.2 88.0 59.9
stats$Internet.users[2]
## [1] 5.9
#--------------------------
levels(stats$Income.Group)
## [1] "High income" "Low income" "Lower middle income"
## [4] "Upper middle income"
Basic operations with a Data Frame
stats[1:10,]
stats[3:9,]
#how [] works
# selecting a row keeps the data frame structure
stats[1,]
is.data.frame(stats[1,])
## [1] TRUE
# selecting a single column drops the result to a plain vector
is.data.frame(stats[,1])
## [1] FALSE
# drop = FALSE keeps the single column as a data frame
# (FALSE is spelled out because F is an ordinary variable that can be reassigned)
is.data.frame(stats[, 1, drop = FALSE])
## [1] TRUE
# multiply columns
stats$Birth.rate * stats$Internet.users
## [1] 808.2516 207.9927 878.3135 736.5644 971.8720 1061.1884 557.6052
## [8] 1042.7398 1095.6000 757.8167 1074.2100 57.3963 920.3062 178.5560
## [15] 369.0141 133.5415 488.1658 1353.6006 1104.4080 523.6930 677.1250
## [22] 775.8912 991.1200 895.2778 762.0782 889.7240 1058.1225 542.2066
## [29] 379.0050 119.2660 935.2200 880.6680 890.1025 554.1800 313.4880
## [36] 238.3104 244.2726 831.1292 223.1190 810.9375 690.4111 290.4720
## [43] 926.2500 748.5411 755.9261 715.4450 242.1170 946.2970 972.9882
## [50] 408.1770 850.2521 824.1408 31.3200 651.8785 817.8200 62.5575
## [57] 979.2041 759.1773 1007.6135 653.6058 281.1060 1096.0980 577.2756
## [64] 407.5113 59.7392 595.3500 116.2593 579.9368 508.8636 676.6900
## [71] 954.1000 541.0605 1137.2406 660.9750 586.1800 384.3554 627.4274
## [78] 268.6570 668.3239 303.2372 306.3941 1173.7155 536.1050 286.0556
## [85] 1293.7271 1508.0400 496.9040 502.3340 1108.8860 735.6220 1227.4200
## [92] 1372.5660 625.6000 166.3416 334.0060 729.0220 1552.5895 338.1375
## [99] 946.5330 113.6672 353.5125 712.8660 862.9600 391.1997 143.6900
## [106] 691.3743 1059.6744 767.3909 740.6448 1177.2880 546.3450 104.0580
## [113] 945.8127 830.2598 732.1233 154.4830 654.6811 28.9904 700.5610
## [120] 485.5000 214.4070 209.5662 425.1000 199.2680 1125.4308 416.1243
## [127] 1122.0000 84.4237 1521.7100 322.2140 958.3553 1102.6194 278.2759
## [134] 1086.0736 1356.8426 322.4438 866.5104 791.7616 880.2300 187.8435
## [141] 603.3523 798.1200 490.5552 796.5972 931.1224 1018.4820 437.9276
## [148] 897.2040 294.2010 1244.8480 759.9279 504.7823 753.3000 244.6240
## [155] 62.4393 403.8581 65.8365 473.8000 523.4766 794.3510 690.2170
## [162] 786.6143 741.2911 1118.4465 743.2971 937.4400 629.9266 105.2135
## [169] 162.3600 319.5265 492.6720 204.6912 39.3305 889.3150 930.8420
## [176] 867.2400 778.6650 173.8792 704.2788 455.1000 829.2361 1052.5000
## [183] 859.5000 847.9120 1089.3258 484.7100 682.0743 302.1507 1416.3604
## [190] 400.4316 658.9400 969.5250 93.2668 623.2534 660.7275
stats$Birth.rate + stats$Internet.users
## [1] 89.14400 41.15300 65.08500 70.07700 99.04400 77.61600 55.20800
## [8] 79.84700 96.20000 90.01880 77.00000 45.45100 93.37020 41.34000
## [15] 49.65100 26.77200 62.26150 105.04004 87.33900 66.85200 66.67000
## [22] 56.69200 105.70000 61.17600 65.97100 85.18800 80.90500 48.03400
## [29] 40.26700 37.57600 96.70000 96.54000 79.88500 57.90000 45.72000
## [36] 43.63600 43.61100 67.77600 40.82600 59.12500 60.98200 38.33000
## [43] 86.60000 76.89080 84.31040 92.67000 34.98600 104.62970 67.09800
## [50] 41.23800 61.42368 57.43200 35.70000 80.73500 89.70000 34.82500
## [57] 102.21440 57.56300 94.21980 51.31100 39.75500 102.04410 56.63200
## [64] 45.43100 38.93700 56.52500 40.60300 51.76200 68.36630 54.33400
## [71] 80.30000 47.16500 82.78900 53.88500 82.10000 39.39300 76.14760
## [78] 35.94500 81.84390 35.23700 35.39100 93.24770 47.85000 40.29300
## [85] 109.94680 92.10000 66.95930 50.64000 68.04600 97.91000 76.73000
## [92] 74.19400 50.20000 31.26200 40.54400 93.37000 96.03500 39.55100
## [99] 83.92600 38.72100 37.92500 61.63000 103.00000 39.76300 33.73800
## [106] 78.55290 105.07650 85.43440 77.05600 77.02300 57.14100 37.68600
## [113] 65.54700 62.56400 76.46200 47.63800 78.41380 19.71900 71.92600
## [120] 44.27500 45.10500 40.00100 49.90000 44.50900 83.77500 43.83700
## [127] 83.00000 51.36100 78.04500 36.28800 104.15640 106.65340 34.22300
## [134] 95.90000 86.86900 40.48200 63.71000 59.39800 60.79000 35.39900
## [141] 72.44920 84.70000 69.99560 58.48800 73.19300 97.24000 58.56450
## [148] 81.17000 41.68900 81.07600 56.17700 51.63300 90.30000 38.57800
## [155] 38.42900 40.58530 45.39100 60.70000 51.22600 57.53700 55.85500
## [162] 87.98260 82.87560 106.58360 54.79300 69.00000 50.24300 48.04500
## [169] 40.58000 39.98100 46.79200 30.92200 36.85500 60.40900 78.39000
## [176] 63.60000 63.08600 43.91800 59.67400 52.10000 72.06400 96.70000
## [183] 60.70000 68.30600 74.74200 56.00000 59.43700 38.03900 76.99400
## [190] 41.47200 52.94700 67.35000 44.59400 55.87100 54.21500
head(stats)
# store the product of the two numeric columns as a new column
stats$mycal = stats$Birth.rate * stats$Internet.users
head(stats)
#add column
# 1:5 is recycled to fill all 195 rows; this works because 195 is a multiple of 5.
# A length that does not divide 195 fails, e.g. 1:4 gives the error
# "replacement has 4 rows, data has 195"
stats$xyz = 1:5
head(stats, n = 12)
#remove col
# assigning NULL deletes a column
stats$mycal = NULL
stats$xyz = NULL
head(stats)
Filtering a Data Frame
head(stats)
filter = stats$Internet.users < 2 # compare each number with 2
stats[filter, ] #get country that number of internet users is low
stats[stats$Birth.rate > 40,]
stats[stats$Birth.rate > 40 & stats$Internet.users < 2,] # birth > 40, internet < 2
stats[stats$Income.Group == "High income",]
levels(stats$Income.Group)
## [1] "High income" "Low income" "Lower middle income"
## [4] "Upper middle income"
#
stats[stats$Country.Name == "Malta",]
n = "United States"
stats[stats$Country.Name == n,]
Introduction to qplot
library(ggplot2)
# With only x supplied, qplot() draws a histogram of Internet.users
# (the dangling empty argument after x has been removed)
qplot(data = stats, x = Internet.users)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# I(2) passes the size through "as is" (a fixed point size of 2) instead of
# mapping the value 2 onto a size scale with a legend
qplot(data = stats, x = Income.Group, y = Birth.rate, size = I(2))
#---------------------------
qplot(data = stats,
x = Income.Group,
y = Birth.rate,
size = I(2),
color = I("blue"))
#boxplot
qplot(data = stats,
x = Income.Group,
y = Birth.rate,
geom = "boxplot")
Visualizing With Qplot: Part I
# visual
options(warn=-1)
qplot(data = stats,
x = Internet.users,
y = Birth.rate
)
# visual2
options(warn=-1)
qplot(data = stats,
x = Internet.users,
y = Birth.rate,
size = I(.75),
color = I("red"))
# visual3 low income more birthrate, low internet.
# high income, low birthrate, high internet.
options(warn=-1)
qplot(data = stats,
x = Internet.users,
y = Birth.rate,
size = I(2.0),
color = Income.Group)
Building Dataframes
#Execute below code to generate three new vectors
# Country names for the 2012 dataset. The whole vector is kept on one line so that
# no string literal is split: a line break inside "St. Lucia" would become part of the name.
Countries_2012_Dataset <- c("Aruba","Afghanistan","Angola","Albania","United Arab Emirates","Argentina","Armenia","Antigua and Barbuda","Australia","Austria","Azerbaijan","Burundi","Belgium","Benin","Burkina Faso","Bangladesh","Bulgaria","Bahrain","Bahamas, The","Bosnia and Herzegovina","Belarus","Belize","Bermuda","Bolivia","Brazil","Barbados","Brunei Darussalam","Bhutan","Botswana","Central African Republic","Canada","Switzerland","Chile","China","Cote d'Ivoire","Cameroon","Congo, Rep.","Colombia","Comoros","Cabo Verde","Costa Rica","Cuba","Cayman Islands","Cyprus","Czech Republic","Germany","Djibouti","Denmark","Dominican Republic","Algeria","Ecuador","Egypt, Arab Rep.","Eritrea","Spain","Estonia","Ethiopia","Finland","Fiji","France","Micronesia, Fed. Sts.","Gabon","United Kingdom","Georgia","Ghana","Guinea","Gambia, The","Guinea-Bissau","Equatorial Guinea","Greece","Grenada","Greenland","Guatemala","Guam","Guyana","Hong Kong SAR, China","Honduras","Croatia","Haiti","Hungary","Indonesia","India","Ireland","Iran, Islamic Rep.","Iraq","Iceland","Israel","Italy","Jamaica","Jordan","Japan","Kazakhstan","Kenya","Kyrgyz Republic","Cambodia","Kiribati","Korea, Rep.","Kuwait","Lao PDR","Lebanon","Liberia","Libya","St. Lucia","Liechtenstein","Sri Lanka","Lesotho","Lithuania","Luxembourg","Latvia","Macao SAR, China","Morocco","Moldova","Madagascar","Maldives","Mexico","Macedonia, FYR","Mali","Malta","Myanmar","Montenegro","Mongolia","Mozambique","Mauritania","Mauritius","Malawi","Malaysia","Namibia","New Caledonia","Niger","Nigeria","Nicaragua","Netherlands","Norway","Nepal","New Zealand","Oman","Pakistan","Panama","Peru","Philippines","Papua New Guinea","Poland","Puerto Rico","Portugal","Paraguay","French Polynesia","Qatar","Romania","Russian Federation","Rwanda","Saudi Arabia","Sudan","Senegal","Singapore","Solomon Islands","Sierra Leone","El Salvador","Somalia","Serbia","South Sudan","Sao Tome and Principe","Suriname","Slovak Republic","Slovenia","Sweden","Swaziland","Seychelles","Syrian Arab Republic","Chad","Togo","Thailand","Tajikistan","Turkmenistan","Timor-Leste","Tonga","Trinidad and Tobago","Tunisia","Turkey","Tanzania","Uganda","Ukraine","Uruguay","United States","Uzbekistan","St. Vincent and the Grenadines","Venezuela, RB","Virgin Islands (U.S.)","Vietnam","Vanuatu","West Bank and Gaza","Samoa","Yemen, Rep.","South Africa","Congo, Dem. Rep.","Zambia","Zimbabwe")
Codes_2012_Dataset <- c("ABW","AFG","AGO","ALB","ARE","ARG","ARM","ATG","AUS","AUT","AZE","BDI","BEL","BEN","BFA","BGD","BGR","BHR","BHS","BIH","BLR","BLZ","BMU","BOL","BRA","BRB","BRN","BTN","BWA","CAF","CAN","CHE","CHL","CHN","CIV","CMR","COG","COL","COM","CPV","CRI","CUB","CYM","CYP","CZE","DEU","DJI","DNK","DOM","DZA","ECU","EGY","ERI","ESP","EST","ETH","FIN","FJI","FRA","FSM","GAB","GBR","GEO","GHA","GIN","GMB","GNB","GNQ","GRC","GRD","GRL","GTM","GUM","GUY","HKG","HND","HRV","HTI","HUN","IDN","IND","IRL","IRN","IRQ","ISL","ISR","ITA","JAM","JOR","JPN","KAZ","KEN","KGZ","KHM","KIR","KOR","KWT","LAO","LBN","LBR","LBY","LCA","LIE","LKA","LSO","LTU","LUX","LVA","MAC","MAR","MDA","MDG","MDV","MEX","MKD","MLI","MLT","MMR","MNE","MNG","MOZ","MRT","MUS","MWI","MYS","NAM","NCL","NER","NGA","NIC","NLD","NOR","NPL","NZL","OMN","PAK","PAN","PER","PHL","PNG","POL","PRI","PRT","PRY","PYF","QAT","ROU","RUS","RWA","SAU","SDN","SEN","SGP","SLB","SLE","SLV","SOM","SRB","SSD","STP","SUR","SVK","SVN","SWE","SWZ","SYC","SYR","TCD","TGO","THA","TJK","TKM","TLS","TON","TTO","TUN","TUR","TZA","UGA","UKR","URY","USA","UZB","VCT","VEN","VIR","VNM","VUT","PSE","WSM","YEM","ZAF","COD","ZMB","ZWE")
# Region of each country in the 2012 dataset. The whole vector is kept on one line so that
# no string literal is split: a line break inside "Middle East" would create a bogus extra region.
Regions_2012_Dataset <- c("The Americas","Asia","Africa","Europe","Middle East","The Americas","Asia","The Americas","Oceania","Europe","Asia","Africa","Europe","Africa","Africa","Asia","Europe","Middle East","The Americas","Europe","Europe","The Americas","The Americas","The Americas","The Americas","The Americas","Asia","Asia","Africa","Africa","The Americas","Europe","The Americas","Asia","Africa","Africa","Africa","The Americas","Africa","Africa","The Americas","The Americas","The Americas","Europe","Europe","Europe","Africa","Europe","The Americas","Africa","The Americas","Africa","Africa","Europe","Europe","Africa","Europe","Oceania","Europe","Oceania","Africa","Europe","Asia","Africa","Africa","Africa","Africa","Africa","Europe","The Americas","The Americas","The Americas","Oceania","The Americas","Asia","The Americas","Europe","The Americas","Europe","Asia","Asia","Europe","Middle East","Middle East","Europe","Middle East","Europe","The Americas","Middle East","Asia","Asia","Africa","Asia","Asia","Oceania","Asia","Middle East","Asia","Middle East","Africa","Africa","The Americas","Europe","Asia","Africa","Europe","Europe","Europe","Asia","Africa","Europe","Africa","Asia","The Americas","Europe","Africa","Europe","Asia","Europe","Asia","Africa","Africa","Africa","Africa","Asia","Africa","Oceania","Africa","Africa","The Americas","Europe","Europe","Asia","Oceania","Middle East","Asia","The Americas","The Americas","Asia","Oceania","Europe","The Americas","Europe","The Americas","Oceania","Middle East","Europe","Europe","Africa","Middle East","Africa","Africa","Asia","Oceania","Africa","The Americas","Africa","Europe","Africa","Africa","The Americas","Europe","Europe","Europe","Africa","Africa","Middle East","Africa","Africa","Asia","Asia","Asia","Asia","Oceania","The Americas","Africa","Europe","Africa","Africa","Europe","The Americas","The Americas","Asia","The Americas","The Americas","The Americas","Asia","Oceania","Middle East","Oceania","Middle East","Africa","Africa","Africa","Africa")
#References:
#(c) Kirill Eremenko, www.superdatascience.com
continue
# create a data frame from the three vectors, then rename its columns
df <- data.frame(Countries_2012_Dataset,
                 Codes_2012_Dataset,
                 Regions_2012_Dataset)
colnames(df) <- c("Country", "Code", "Region")
head(df)
rm(df)
# alternative: name the columns directly in the data.frame() call
# stringsAsFactors = TRUE stores the character vectors as factors
# (TRUE is spelled out because T is an ordinary variable that can be reassigned)
df <- data.frame(Country = Countries_2012_Dataset,
                 Code = Codes_2012_Dataset,
                 Region = Regions_2012_Dataset,
                 stringsAsFactors = TRUE)
head(df)
summary(df)
## Country Code Region
## Afghanistan : 1 ABW : 1 Africa :54
## Albania : 1 AFG : 1 Asia :33
## Algeria : 1 AGO : 1 Europe :42
## Angola : 1 ALB : 1 Middle East :14
## Antigua and Barbuda: 1 ARE : 1 Oceania :13
## Argentina : 1 ARG : 1 The Americas:39
## (Other) :189 (Other):189
Merging Data Frames
# merge data.frame
head(stats)
head(df)
# merge() defaults to all = FALSE, i.e. an inner join: only codes found in
# both data frames are kept (all.x = TRUE would make it a left join).
# The str() output below shows 195 rows, so every row of stats found a match.
merged = merge(stats, df, by.x = 'Country.Code', by.y = 'Code')
head(merged)
# Country (from df) presumably duplicates Country.Name (from stats), so drop it
merged$Country = NULL
str(merged)
## 'data.frame': 195 obs. of 6 variables:
## $ Country.Code : Factor w/ 195 levels "ABW","AFG","AGO",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Country.Name : Factor w/ 195 levels "Afghanistan",..: 8 1 4 2 183 6 7 5 9 10 ...
## $ Birth.rate : num 10.2 35.3 46 12.9 11 ...
## $ Internet.users: num 78.9 5.9 19.1 57.2 88 ...
## $ Income.Group : Factor w/ 4 levels "High income",..: 1 2 4 4 1 1 3 1 1 1 ...
## $ Region : Factor w/ 6 levels "Africa","Asia",..: 6 2 1 3 4 6 2 6 5 3 ...
tail(merged)
Visualizing With Qplot: Part II
# visual with new split
qplot(data = merged, x = Internet.users, y = Birth.rate)
qplot(data = merged, x = Internet.users, y = Birth.rate, color = Region)
# make it nicer,
# change shapes
qplot(data = merged,
x = Internet.users,
y = Birth.rate,
color = Region,
size = I(2),
shape = I(25))
# Point shapes (pch / shape values):
# pch = 0, square
# pch = 1, circle
# pch = 2, triangle point up
# pch = 3, plus
# pch = 4, cross
# pch = 5, diamond
# pch = 6, triangle point down
# pch = 7, square cross
# pch = 8, star
# pch = 9, diamond plus
# pch = 10, circle plus
# pch = 11, triangles up and down
# pch = 12, square plus
# pch = 13, circle cross
# pch = 14, square and triangle down
# pch = 15, filled square
# pch = 16, filled circle
# pch = 17, filled triangle point-up
# pch = 18, filled diamond
# pch = 19, solid circle
# pch = 20, bullet (smaller circle)
# pch = 21, filled circle blue
# pch = 22, filled square blue
# pch = 23, filled diamond blue
# pch = 24, filled triangle point-up blue
# pch = 25, filled triangle point down blue
# transparency
qplot(data = merged,
x = Internet.users,
y = Birth.rate,
color = Region,
size = I(2),
shape = I(19),
alpha = I(0.51)) # use alpha to change transparency 0~1.
# add title
qplot(data = merged,
x = Internet.users,
y = Birth.rate,
color = Region,
size = I(2),
shape = I(19),
alpha = I(0.51),
main = "Birth rate vs Internet Users")
HOMEWORK: World Trends
#Execute below code to generate three new vectors
Country_Code <- c("ABW","AFG","AGO","ALB","ARE","ARG","ARM","ATG","AUS","AUT","AZE","BDI","BEL","BEN","BFA","BGD","BGR","BHR","BHS","BIH","BLR","BLZ","BOL","BRA","BRB","BRN","BTN","BWA","CAF","CAN","CHE","CHL","CHN","CIV","CMR","COG","COL","COM","CPV","CRI","CUB","CYP","CZE","DEU","DJI","DNK","DOM","DZA","ECU","EGY","ERI","ESP","EST","ETH","FIN","FJI","FRA","FSM","GAB","GBR","GEO","GHA","GIN","GMB","GNB","GNQ","GRC","GRD","GTM","GUM","GUY","HKG","HND","HRV","HTI","HUN","IDN","IND","IRL","IRN","IRQ","ISL","ITA","JAM","JOR","JPN","KAZ","KEN","KGZ","KHM","KIR","KOR","KWT","LAO","LBN","LBR","LBY","LCA","LKA","LSO","LTU","LUX","LVA","MAC","MAR","MDA","MDG","MDV","MEX","MKD","MLI","MLT","MMR","MNE","MNG","MOZ","MRT","MUS","MWI","MYS","NAM","NCL","NER","NGA","NIC","NLD","NOR","NPL","NZL","OMN","PAK","PAN","PER","PHL","PNG","POL","PRI","PRT","PRY","PYF","QAT","ROU","RUS","RWA","SAU","SDN","SEN","SGP","SLB","SLE","SLV","SOM","SSD","STP","SUR","SVK","SVN","SWE","SWZ","SYR","TCD","TGO","THA","TJK","TKM","TLS","TON","TTO","TUN","TUR","TZA","UGA","UKR","URY","USA","UZB","VCT","VEN","VIR","VNM","VUT","WSM","YEM","ZAF","COD","ZMB","ZWE")
# Life expectancy at birth in 1960, in the same order as Country_Code.
# The whole vector is kept on one line so that no numeric literal is split across lines,
# which would be a parse error.
Life_Expectancy_At_Birth_1960 <- c(65.5693658536586,32.328512195122,32.9848292682927,62.2543658536585,52.2432195121951,65.2155365853659,65.8634634146342,61.7827317073171,70.8170731707317,68.5856097560976,60.836243902439,41.2360487804878,69.7019512195122,37.2782682926829,34.4779024390244,45.8293170731707,69.2475609756098,52.0893658536585,62.7290487804878,60.2762195121951,67.7080975609756,59.9613658536585,42.1183170731707,54.2054634146342,60.7380487804878,62.5003658536585,32.3593658536585,50.5477317073171,36.4826341463415,71.1331707317073,71.3134146341463,57.4582926829268,43.4658048780488,36.8724146341463,41.523756097561,48.5816341463415,56.716756097561,41.4424390243903,48.8564146341463,60.5761951219512,63.9046585365854,69.5939268292683,70.3487804878049,69.3129512195122,44.0212682926829,72.1765853658537,51.8452682926829,46.1351219512195,53.215,48.0137073170732,37.3629024390244,69.1092682926829,67.9059756097561,38.4057073170732,68.819756097561,55.9584878048781,69.8682926829268,57.5865853658537,39.5701219512195,71.1268292682927,63.4318536585366,45.8314634146342,34.8863902439024,32.0422195121951,37.8404390243902,36.7330487804878,68.1639024390244,59.8159268292683,45.5316341463415,61.2263414634146,60.2787317073171,66.9997073170732,46.2883170731707,64.6086585365854,42.1000975609756,68.0031707317073,48.6403170731707,41.1719512195122,69.691756097561,44.945512195122,48.0306829268293,73.4286585365854,69.1239024390244,64.1918292682927,52.6852682926829,67.6660975609756,58.3675853658537,46.3624146341463,56.1280731707317,41.2320243902439,49.2159756097561,53.0013170731707,60.3479512195122,43.2044634146342,63.2801219512195,34.7831707317073,42.6411951219512,57.303756097561,59.7471463414634,46.5107073170732,69.8473170731707,68.4463902439024,69.7868292682927,64.6609268292683,48.4466341463415,61.8127804878049,39.9746829268293,37.2686341463415,57.0656341463415,60.6228048780488,28.2116097560976,67.6017804878049,42.7363902439024,63.7056097560976,48.3688048780488,35.0037073170732,43.4830975609756,58.7452195121951,37.7736341463415,59.4753414634146,46.8803902439024,58.6390243902439,35.5150487804878,37.1829512195122,46.9988292682927,73.3926829268293,73.549756097561,35.1708292682927,71.2365853658537,42.6670731707317,45.2904634146342,60.8817073170732,47.6915853658537,57.8119268292683,38.462243902439,67.6804878048781,68.7196097560976,62.8089268292683,63.7937073170732,56.3570487804878,61.2060731707317,65.6424390243903,66.0552926829268,42.2492926829268,45.6662682926829,48.1876341463415,38.206,65.6598292682927,49.3817073170732,30.3315365853659,49.9479268292683,36.9658780487805,31.6767073170732,50.4513658536585,59.6801219512195,69.9759268292683,68.9780487804878,73.0056097560976,44.2337804878049,52.768243902439,38.0161219512195,40.2728292682927,54.6993170731707,56.1535365853659,54.4586829268293,33.7271219512195,61.3645365853659,62.6575853658537,42.009756097561,45.3844146341463,43.6538780487805,43.9835609756098,68.2995365853659,67.8963902439025,69.7707317073171,58.8855365853659,57.7238780487805,59.2851219512195,63.7302195121951,59.0670243902439,46.4874878048781,49.969512195122,34.3638048780488,49.0362926829268,41.0180487804878,45.1098048780488,51.5424634146342)
Life_Expectancy_At_Birth_2013 <- c(75.3286585365854,60.0282682926829,51.8661707317073,77.537243902439,77.1956341463415,75.9860975609756,74.5613658536585,75.7786585365854,82.1975609756098,80.890243902439,70.6931463414634,56.2516097560976,80.3853658536585,59.3120243902439,58.2406341463415,71.245243902439,74.4658536585366,76.5459512195122,75.0735365853659,76.2769268292683,72.4707317073171,69.9820487804878,67.9134390243903,74.1224390243903,75.3339512195122,78.5466585365854,69.1029268292683,64.3608048780488,49.8798780487805,81.4011219512195,82.7487804878049,81.1979268292683,75.3530243902439,51.2084634146342,55.0418048780488,61.6663902439024,73.8097317073171,62.9321707317073,72.9723658536585,79.2252195121951,79.2563902439025,79.9497804878049,78.2780487804878,81.0439024390244,61.6864634146342,80.3024390243903,73.3199024390244,74.5689512195122,75.648512195122,70.9257804878049,63.1778780487805,82.4268292682927,76.4243902439025,63.4421951219512,80.8317073170732,69.9179268292683,81.9682926829268,68.9733902439024,63.8435853658537,80.9560975609756,74.079512195122,61.1420731707317,58.216487804878,59.9992682926829,54.8384146341464,57.2908292682927,80.6341463414634,73.1935609756098,71.4863902439024,78.872512195122,66.3100243902439,83.8317073170732,72.9428536585366,77.1268292682927,62.4011463414634,75.2682926829268,68.7046097560976,67.6604146341463,81.0439024390244,75.1259756097561,69.4716829268293,83.1170731707317,82.290243902439,73.4689268292683,73.9014146341463,83.3319512195122,70.45,60.9537804878049,70.2024390243902,67.7720487804878,65.7665853658537,81.459756097561,74.462756097561,65.687243902439,80.1288780487805,60.5203902439024,71.6576829268293,74.9127073170732,74.2402926829268,49.3314634146342,74.1634146341464,81.7975609756098,73.9804878048781,80.3391463414634,73.7090487804878,68.811512195122,64.6739024390244,76.6026097560976,76.5326585365854,75.1870487804878,57.5351951219512,80.7463414634146,65.6540975609756,74.7583658536585,69.0618048780488,54.641512195122,62.8027073170732,
74.46,61.466,74.567512195122,64.3438780487805,77.1219512195122,60.8281463414634,52.4421463414634,74.514756097561,81.1048780487805,81.4512195121951,69.222,81.4073170731707,76.8410487804878,65.9636829268293,77.4192195121951,74.2838536585366,68.1315609756097,62.4491707317073,76.8487804878049,78.7111951219512,80.3731707317073,72.7991707317073,76.3340731707317,78.4184878048781,74.4634146341463,71.0731707317073,63.3948292682927,74.1776341463415,63.1670487804878,65.878756097561,82.3463414634146,67.7189268292683,50.3631219512195,72.4981463414634,55.0230243902439,55.2209024390244,66.259512195122,70.99,76.2609756097561,80.2780487804878,81.7048780487805,48.9379268292683,74.7157804878049,51.1914878048781,59.1323658536585,74.2469268292683,69.4001707317073,65.4565609756098,67.5223658536585,72.6403414634147,70.3052926829268,73.6463414634147,75.1759512195122,64.2918292682927,57.7676829268293,71.159512195122,76.8361951219512,78.8414634146341,68.2275853658537,72.8108780487805,74.0744146341464,79.6243902439024,75.756487804878,71.669243902439,73.2503902439024,63.583512195122,56.7365853658537,58.2719268292683,59.2373658536585,55.633)
#References:
#(c) Kirill Eremenko, www.superdatascience.com
continue
#Set the Working Directory
getwd()
## [1] "/home/jupyter-yangbdm/new folder/R-program"
setwd("/home/jupyter-yangbdm/new folder/R-program")
getwd()
## [1] "/home/jupyter-yangbdm/new folder/R-program"
#Import the csv data set
# stringsAsFactors = TRUE reads the character columns in as factors
# (TRUE is spelled out because T is an ordinary variable that can be reassigned)
data <- read.csv("P2-Section5-Homework-Data.csv", stringsAsFactors = TRUE)
#Explore the data
head(data, n = 6) #check top 6 rows
tail(data, n=7) #check bottom 7 rows
str(data) #check the structure of the data frame
## 'data.frame': 374 obs. of 5 variables:
## $ Country.Name : Factor w/ 187 levels "Afghanistan",..: 8 1 4 2 176 6 7 5 9 10 ...
## $ Country.Code : Factor w/ 187 levels "ABW","AFG","AGO",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Region : Factor w/ 6 levels "Africa","Asia",..: 6 2 1 3 4 6 2 6 5 3 ...
## $ Year : int 1960 1960 1960 1960 1960 1960 1960 1960 1960 1960 ...
## $ Fertility.Rate: num 4.82 7.45 7.38 6.19 6.93 ...
summary(data) #check the summary of the data
## Country.Name Country.Code Region Year
## Afghanistan : 2 ABW : 2 Africa :106 Min. :1960
## Albania : 2 AFG : 2 Asia : 66 1st Qu.:1960
## Algeria : 2 AGO : 2 Europe : 80 Median :1986
## Angola : 2 ALB : 2 Middle East : 24 Mean :1986
## Antigua and Barbuda: 2 ARE : 2 Oceania : 26 3rd Qu.:2013
## Argentina : 2 ARG : 2 The Americas: 72 Max. :2013
## (Other) :362 (Other):362
## Fertility.Rate
## Min. :1.124
## 1st Qu.:2.243
## Median :3.994
## Mean :4.191
## 3rd Qu.:6.252
## Max. :8.187
##
#Did you pick up that there is more than one year in the data?
#From the challenge we know that there are two: 1960 and 2013
#Filter the dataframes
data1960 <- data[data$Year==1960,]
data2013 <- data[data$Year==2013,]
#Check row counts
nrow(data1960) #187 rows
## [1] 187
nrow(data2013) #187 rows. Equal split.
## [1] 187
#Create the additional dataframes
add1960 <- data.frame(Code=Country_Code, Life.Exp=Life_Expectancy_At_Birth_1960)
add2013 <- data.frame(Code=Country_Code, Life.Exp=Life_Expectancy_At_Birth_2013)
#Check summaries
summary(add1960)
## Code Life.Exp
## Length:187 Min. :28.21
## Class :character 1st Qu.:43.47
## Mode :character Median :54.70
## Mean :53.73
## 3rd Qu.:64.05
## Max. :73.55
summary(add2013)
## Code Life.Exp
## Length:187 Min. :48.94
## Class :character 1st Qu.:64.52
## Mode :character Median :73.25
## Mean :70.76
## 3rd Qu.:76.84
## Max. :83.83
#Merge the pairs of dataframes
merged1960 <- merge(data1960, add1960, by.x="Country.Code", by.y="Code")
merged2013 <- merge(data2013, add2013, by.x="Country.Code", by.y="Code")
#Check the new structures
str(merged1960)
## 'data.frame': 187 obs. of 6 variables:
## $ Country.Code : Factor w/ 187 levels "ABW","AFG","AGO",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Country.Name : Factor w/ 187 levels "Afghanistan",..: 8 1 4 2 176 6 7 5 9 10 ...
## $ Region : Factor w/ 6 levels "Africa","Asia",..: 6 2 1 3 4 6 2 6 5 3 ...
## $ Year : int 1960 1960 1960 1960 1960 1960 1960 1960 1960 1960 ...
## $ Fertility.Rate: num 4.82 7.45 7.38 6.19 6.93 ...
## $ Life.Exp : num 65.6 32.3 33 62.3 52.2 ...
str(merged2013)
## 'data.frame': 187 obs. of 6 variables:
## $ Country.Code : Factor w/ 187 levels "ABW","AFG","AGO",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Country.Name : Factor w/ 187 levels "Afghanistan",..: 8 1 4 2 176 6 7 5 9 10 ...
## $ Region : Factor w/ 6 levels "Africa","Asia",..: 6 2 1 3 4 6 2 6 5 3 ...
## $ Year : int 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
## $ Fertility.Rate: num 1.67 5.05 6.17 1.77 1.8 ...
## $ Life.Exp : num 75.3 60 51.9 77.5 77.2 ...
#We can see an obsolete column in each of the merged dataframes
#Column "Year" is no longer required. Let's remove it
merged1960$Year <- NULL
merged2013$Year <- NULL
#Check structures again
str(merged1960)
## 'data.frame': 187 obs. of 5 variables:
## $ Country.Code : Factor w/ 187 levels "ABW","AFG","AGO",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Country.Name : Factor w/ 187 levels "Afghanistan",..: 8 1 4 2 176 6 7 5 9 10 ...
## $ Region : Factor w/ 6 levels "Africa","Asia",..: 6 2 1 3 4 6 2 6 5 3 ...
## $ Fertility.Rate: num 4.82 7.45 7.38 6.19 6.93 ...
## $ Life.Exp : num 65.6 32.3 33 62.3 52.2 ...
str(merged2013)
## 'data.frame': 187 obs. of 5 variables:
## $ Country.Code : Factor w/ 187 levels "ABW","AFG","AGO",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Country.Name : Factor w/ 187 levels "Afghanistan",..: 8 1 4 2 176 6 7 5 9 10 ...
## $ Region : Factor w/ 6 levels "Africa","Asia",..: 6 2 1 3 4 6 2 6 5 3 ...
## $ Fertility.Rate: num 1.67 5.05 6.17 1.77 1.8 ...
## $ Life.Exp : num 75.3 60 51.9 77.5 77.2 ...
#Visualization time
library(ggplot2)
options(warn=-1)
#Visualize the 1960 dataset
qplot(data=merged1960, x=Fertility.Rate, y=Life.Exp,
color=Region, #colour
size=I(2),
alpha=I(0.6), #transparency
main="Life Expectancy vs Fertility (1960)" #title
)
#Visualize the 2013 dataset
# Same plot as for 1960, but on merged2013; the title names the year
# that is actually being plotted
qplot(data=merged2013, x=Fertility.Rate, y=Life.Exp,
      color=Region, #colour
      size=I(2),
      alpha=I(0.6), #transparency
      main="Life Expectancy vs Fertility (2013)" #title
)
Project Brief: Movie Ratings
# Movie Ratings introduction, no code
Grammar Of Graphics - GGPlot2
# Grammar introduction, no code
# Data: movie name, budget, genre.
# Aesthetics: x axis, y axis, color of columns.
# Geometries: statistics, histograms; group rows by genre or other variables.
# Facets: ~, separate charts.
# Coordinates: x-y chart.
# Theme: title, labels, size of pictures, subtitles.
What is a Factor?
# Load the movie ratings data, reading text columns in as factors
movies <- read.csv("P2-Movie-Ratings.csv", stringsAsFactors = TRUE)
head(movies)
# Replace the column names with shorter ones
colnames(movies) <- c(
  "Film", "Genre", "CriticRating", "AudienceRating", "BudgetMillions", "Year"
)
str(movies)
## 'data.frame': 562 obs. of 6 variables:
## $ Film : Factor w/ 562 levels "(500) Days of Summer ",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Genre : Factor w/ 7 levels "Action","Adventure",..: 3 2 1 2 3 1 3 5 3 3 ...
## $ CriticRating : int 87 9 30 93 55 39 40 50 43 93 ...
## $ AudienceRating: int 81 44 52 84 70 63 71 57 48 93 ...
## $ BudgetMillions: int 8 105 20 18 20 200 30 32 28 8 ...
## $ Year : int 2009 2008 2009 2010 2009 2009 2008 2007 2011 2011 ...
summary(movies)
## Film Genre CriticRating AudienceRating
## (500) Days of Summer : 1 Action :154 Min. : 0.0 Min. : 0.00
## 10,000 B.C. : 1 Adventure: 29 1st Qu.:25.0 1st Qu.:47.00
## 12 Rounds : 1 Comedy :172 Median :46.0 Median :58.00
## 127 Hours : 1 Drama :101 Mean :47.4 Mean :58.83
## 17 Again : 1 Horror : 49 3rd Qu.:70.0 3rd Qu.:72.00
## 2012 : 1 Romance : 21 Max. :97.0 Max. :96.00
## (Other) :556 Thriller : 36
## BudgetMillions Year
## Min. : 0.0 Min. :2007
## 1st Qu.: 20.0 1st Qu.:2008
## Median : 35.0 Median :2009
## Mean : 50.1 Mean :2009
## 3rd Qu.: 65.0 3rd Qu.:2010
## Max. :300.0 Max. :2011
##
# Year was read as an integer (see str() above); convert it to a factor.
# factor(movies$Year) on its own only returns the factor, so assign it back.
movies$Year <- factor(movies$Year)
str(movies)
## 'data.frame': 562 obs. of 6 variables:
## $ Film : Factor w/ 562 levels "(500) Days of Summer ",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Genre : Factor w/ 7 levels "Action","Adventure",..: 3 2 1 2 3 1 3 5 3 3 ...
## $ CriticRating : int 87 9 30 93 55 39 40 50 43 93 ...
## $ AudienceRating: int 81 44 52 84 70 63 71 57 48 93 ...
## $ BudgetMillions: int 8 105 20 18 20 200 30 32 28 8 ...
## $ Year : Factor w/ 5 levels "2007","2008",..: 3 2 3 4 3 3 2 1 5 5 ...
Aesthetics
# Aesthetics: map data columns onto visual properties of the plot
library(ggplot2)
ggplot(
  data = movies,
  aes(
    x = CriticRating,
    y = AudienceRating,
    color = Genre,           # color
    size = BudgetMillions    # size
  )
) +
  geom_point()
Plotting With Layers
# Geometries: store the data + aesthetics layer in an object first,
# then add geoms to it one at a time
p <- ggplot(
  data = movies,
  aes(
    x = CriticRating,
    y = AudienceRating,
    color = Genre,
    size = BudgetMillions
  )
)
# Points
p + geom_point()
# Lines (not good)
p + geom_line()
# Multiple layers (still not good)
p + geom_line() + geom_point()
Overriding Aesthetics
# Base plot whose aesthetics the examples below override per layer
q <- ggplot(
  data = movies,
  aes(
    x = CriticRating,
    y = AudienceRating,
    color = Genre,
    size = BudgetMillions
  )
)
# Add a geom layer using the aesthetics as defined above
q + geom_point()
# Overriding aesthetics
# Example 1: size now follows CriticRating
q + geom_point(aes(size = CriticRating))
# Example 2: color now follows BudgetMillions
q + geom_point(aes(color = BudgetMillions))
# Example 3 (inappropriate): x now follows BudgetMillions, so relabel the axis
q + geom_point(aes(x = BudgetMillions)) + xlab("Budget Millions $")
# Example 4: reduce line size
q + geom_line(size = 1) + geom_point()
Mapping vs Setting
# Base plot used to contrast mapping (inside aes()) with setting (outside it)
r <- ggplot(
  data = movies,
  aes(
    x = CriticRating,
    y = AudienceRating,
    color = Genre,
    size = BudgetMillions
  )
)
# Color
# 1. Mapping: color comes from a column
r + geom_point(aes(color = Genre))
# 2. Setting: color is a fixed value
r + geom_point(color = "Darkgreen")
# Wrong: do not put a fixed value inside aes() (kept commented out)
# r + geom_point(aes(color = 'Darkgreen'))
# Size
# Mapping
r + geom_point(aes(size = BudgetMillions))
# Setting
r + geom_point(size = 3)
# Wrong again, for the same reason (kept commented out)
# r + geom_point(aes(size = 3))
Histograms and Density Charts
# Histograms and density charts of the budget column
s <- ggplot(data = movies, aes(x = BudgetMillions))
s + geom_histogram(binwidth = 5)
# Add color: fill the bars by genre
s + geom_histogram(aes(fill = Genre), binwidth = 5)
# Add a border around each bar
s + geom_histogram(aes(fill = Genre), binwidth = 5, color = "Black")
# Density chart: once with the default position, once stacked
s + geom_density(aes(fill = Genre))
s + geom_density(aes(fill = Genre), position = "stack")
Starting Layer Tips
# 1. Put the x aesthetic in the starting layer
t <- ggplot(data = movies, aes(x = AudienceRating))
t + geom_histogram(binwidth = 5, fill = "White", color = "Blue")
# 2. Another way: start with the data only and give x in the geom
t <- ggplot(data = movies)
t + geom_histogram(
  aes(x = AudienceRating),
  binwidth = 5, fill = "White", color = "Blue"
)
# 4. The same starting layer can then plot a different column
t + geom_histogram(
  aes(x = CriticRating),
  binwidth = 5, fill = "White", color = "Blue"
)
# 5. An empty starting layer
t <- ggplot() # create a plot with different dataset
Statistical Transformations
# Scatter plot plus a smoother layer (drawn with fill = NA)
u <- ggplot(
  data = movies,
  aes(x = CriticRating, y = AudienceRating, color = Genre)
)
u + geom_point() + geom_smooth(fill = NA)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# Boxplot: audience rating per genre
u <- ggplot(
  data = movies,
  aes(x = Genre, y = AudienceRating, color = Genre)
)
u + geom_boxplot(size = 1) + geom_point()
# Tip: use jitter instead of plain points, and try both layer orders
u + geom_boxplot(size = 1) + geom_jitter()
u + geom_jitter() + geom_boxplot(size = 1, alpha = 0.5)
Using Facets
# Budget histogram, all genres in one panel
v <- ggplot(data = movies, aes(x = BudgetMillions))
v + geom_histogram(aes(fill = Genre), binwidth = 5, color = "Black")
# Facets: one row per genre
v +
  geom_histogram(aes(fill = Genre), binwidth = 5, color = "Black") +
  facet_grid(Genre ~ ., scales = "free")
# Scatterplots
w <- ggplot(
  data = movies,
  aes(x = CriticRating, y = AudienceRating, color = Genre)
)
w + geom_point(size = 2)
# Facet rows by genre
w +
  geom_point(size = 1) +
  facet_grid(Genre ~ .)
# Facet columns by year
w +
  geom_point(size = 1) +
  facet_grid(. ~ Year)
# Facet by genre (rows) and year (columns)
w +
  geom_point(size = 1) +
  facet_grid(Genre ~ Year)
options(warn = -1)
# Genre-by-year facets with a smoother added
w +
  geom_point(size = 1) +
  geom_smooth() +
  facet_grid(Genre ~ Year)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# Genre-by-year facets with point size mapped to budget
w +
  geom_point(aes(size = BudgetMillions)) +
  geom_smooth() +
  facet_grid(Genre ~ Year)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
Coordinates
# How to zoom in and out
m <- ggplot(
  data = movies,
  aes(
    x = CriticRating,
    y = AudienceRating,
    color = Genre,
    size = BudgetMillions
  )
)
m +
  geom_point() +
  xlim(50, 100) + # zoom in
  ylim(50, 100)
# Zoom for columns: limit the y range through the coordinate system
n <- ggplot(data = movies, aes(x = BudgetMillions))
n +
  geom_histogram(aes(fill = Genre), binwidth = 15, color = "Black") +
  coord_cartesian(ylim = c(0, 50))
# Improve the genre-by-year facet plot (budget as point size) by fixing
# the y range to 0-100
w +
  geom_point(aes(size = BudgetMillions)) +
  geom_smooth() +
  facet_grid(Genre ~ Year) +
  coord_cartesian(ylim = c(0, 100))
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
Perfecting By Adding Themes
# Starting point: budget histogram filled by genre
o <- ggplot(data = movies, aes(x = BudgetMillions))
o + geom_histogram(aes(fill = Genre), binwidth = 10, color = "Black")
# Keep the histogram in an object so labels and themes can be added to it
h <- o + geom_histogram(aes(fill = Genre), binwidth = 10, color = "Black")
# Axis labels and axis formatting
h +
  xlab("Money Axis") +
  ylab("Number of Movies") +
  theme(
    axis.title.x = element_text(color = "DarkGreen", size = 10),
    axis.title.y = element_text(color = "Red", size = 10),
    axis.text.x = element_text(size = 10),
    axis.text.y = element_text(size = 10)
  )
# Legend formatting added on top of the axis formatting
h +
  xlab("Money Axis") +
  ylab("Number of Movies") +
  theme(
    axis.title.x = element_text(color = "DarkGreen", size = 10),
    axis.title.y = element_text(color = "Red", size = 10),
    axis.text.x = element_text(size = 10),
    axis.text.y = element_text(size = 10),
    legend.title = element_text(size = 15),
    legend.text = element_text(size = 15),
    legend.position = c(1, 1),
    legend.justification = c(1, 1)
  )
# Title added on top of the axis and legend formatting
h +
  xlab("Money Axis") +
  ylab("Number of Movies") +
  ggtitle("Movie Budget Distribution") +
  theme(
    axis.title.x = element_text(color = "DarkGreen", size = 10),
    axis.title.y = element_text(color = "Red", size = 10),
    axis.text.x = element_text(size = 10),
    axis.text.y = element_text(size = 10),
    legend.title = element_text(size = 15),
    legend.text = element_text(size = 15),
    legend.position = c(1, 1),
    legend.justification = c(1, 1),
    plot.title = element_text(
      color = "Darkblue",
      size = 20,
      family = "Courier"
    )
  )
chapter 6 Homework
# Load the homework dataset, reading text columns in as factors
mov <- read.csv("Section6-Homework-Data.csv", stringsAsFactors = TRUE)
# Data exploration
nrow(mov) # number of rows
## [1] 608
ncol(mov) # number of columns
## [1] 18
str(mov) # structure of the dataset
## 'data.frame': 608 obs. of 18 variables:
## $ Day.of.Week : Factor w/ 6 levels "Friday","Saturday",..: 1 1 1 1 1 1 4 1 1 1 ...
## $ Director : Factor w/ 337 levels "Aaron Blaise, Robert A. Walker",..: 31 297 233 256 287 76 276 71 108 126 ...
## $ Genre : Factor w/ 15 levels "action","adventure",..: 1 1 1 5 1 1 2 1 1 10 ...
## $ Movie.Title : Factor w/ 608 levels "10,000 B.C.",..: 557 314 466 6 592 161 233 378 128 331 ...
## $ Release.Date : Factor w/ 534 levels "1/05/2009","1/05/2015",..: 273 86 121 134 384 159 347 16 28 257 ...
## $ Studio : Factor w/ 36 levels "Art House Studios",..: 2 2 11 25 25 25 2 31 31 20 ...
## $ Adjusted.Gross...mill.: Factor w/ 585 levels "1,003","1,020",..: 50 51 52 53 54 55 56 57 58 59 ...
## $ Budget...mill. : num 170 66 100 42 150 80 50 85 70 5 ...
## $ Gross...mill. : Factor w/ 561 levels "1,004.60","1,017",..: 30 33 43 27 40 59 63 49 72 45 ...
## $ IMDb.Rating : num 6.7 6.6 6.1 7.2 8 5.8 6 6.8 6.3 5.9 ...
## $ MovieLens.Rating : num 3.26 2.97 2.93 3.62 3.65 2.85 3.16 3.45 2.92 2.9 ...
## $ Overseas...mill. : Factor w/ 551 levels "1,160.60","1,528.10",..: 32 151 172 490 82 66 528 523 150 11 ...
## $ Overseas. : num 55.4 78.6 80.9 31.3 64.4 59.5 39.9 39.3 73.9 49.8 ...
## $ Profit...mill. : Factor w/ 566 levels "1,015.40","1,025.90",..: 366 47 13 94 494 39 100 28 69 189 ...
## $ Profit. : num 18.9 208 106.2 380 36.9 ...
## $ Runtime..min. : int 130 132 126 109 131 134 125 115 92 84 ...
## $ US...mill. : num 90.2 43.6 39.3 138.4 73.1 ...
## $ Gross...US : num 44.6 21.4 19.1 68.7 35.6 40.5 60.1 60.7 26.1 50.2 ...
# Activate ggplot2 (uncomment the install line if the package is missing)
# install.packages("ggplot2")
library(ggplot2)
# {Off-topic} A cool insight from the bar chart of release days:
# notice that no movies are released on a Monday. Ever.
ggplot(data = mov, aes(x = Day.of.Week)) +
  geom_bar()
# Now we need to filter our dataset to leave only the
# Genres and Studios that we are interested in.
# Genre filter: %in% tests membership in the whole set at once. Unlike a
# chain of == comparisons joined with |, it returns FALSE rather than NA for
# a missing Genre, so the subset below cannot pick up all-NA rows.
filt <- mov$Genre %in% c("action", "adventure", "animation", "comedy", "drama")
# Studio filter, built the same way
filt2 <- mov$Studio %in% c(
  "Buena Vista Studios", "WB", "Fox", "Universal", "Sony", "Paramount Pictures"
)
# Apply both row filters to the dataframe
mov2 <- mov[filt & filt2, ]
(mov2)
# Prepare the plot's data and aes layers.
# Note we did not rename the columns:
# use str() or summary() to find out the correct column names.
p <- ggplot(data = mov2, aes(x = Genre, y = Gross...US))
# p # Nothing happens. We need a geom.
# Add a point geom layer
p + geom_point()
# Add a boxplot instead of the points
p + geom_boxplot()
# Notice that outliers are part of the boxplot layer.
# We will use this observation later (*)
# Add jittered points (sized by budget, colored by studio) underneath a
# semi-transparent boxplot whose outlier color is set to NA
q <- p +
  geom_jitter(aes(size = Budget...mill., color = Studio)) +
  geom_boxplot(alpha = 0.5, outlier.color = NA)
q
# Non-data ink: axis labels and plot title
q <- q +
  xlab("Genre") +
  ylab("Gross % US") +
  ggtitle("Domestic Gross %")
q
# Add the theme: text colors, sizes and font family
q <- q +
  theme(
    axis.title.x = element_text(color = "Blue", size = 10),
    axis.title.y = element_text(color = "Blue", size = 10),
    axis.text.x = element_text(size = 10),
    axis.text.y = element_text(size = 10),
    plot.title = element_text(size = 15),
    legend.title = element_text(size = 15),
    legend.text = element_text(size = 15),
    text = element_text(family = "Times New Roman") # change text style
  )
q
# Final touch: rename the size legend.
# labs() sets the label through ggplot2's public interface instead of
# writing into the plot object's internal `labels` list.
q <- q + labs(size = "Budget $M")
q