Udemy: R Programming Practices

2 Programming principles

types of variables

# integers
x = 2L
typeof(x)

## [1] "integer"

#------------
x2 = 2
typeof(x2)

## [1] "double"

# double
y = 2.5
typeof(y)

## [1] "double"

# complex
z = 3 + 2i
typeof(z)

## [1] "complex"

# character
a = "h"
typeof(a)

## [1] "character"

# logical
# Spell out TRUE/FALSE: T and F are ordinary variables that can be reassigned,
# whereas TRUE and FALSE are reserved words.
q1 = TRUE
typeof(q1)

## [1] "logical"

#---------
q2 = FALSE
typeof(q2)

## [1] "logical"

Using Variables

A = 10
B = 5
C = A + B
print(C)

## [1] 15

# variable 1
var1 = 2.5
# variable 2
var2 = 4
result = var1/var2
result

## [1] 0.625

answer = sqrt(var2)
answer

## [1] 2

#character
greeting = "Hello"
name = "World"
message = paste(greeting, name)
message

## [1] "Hello World"

Logical Variables and Operators

# logical:
# TRUE T
# FALSE F

4 < 5

## [1] TRUE

10 > 100

## [1] FALSE

4 == 5

## [1] FALSE

#---------------
# ==
#!=
#>
#<
#>=
#<=
#!
#&
# isTRUE(x)

result = 4 < 5
result

## [1] TRUE

typeof(result)

## [1] "logical"

resultt = !TRUE
resultt

## [1] FALSE

#
result2 = !(5 > 1)
result2

## [1] FALSE

# at least one of them is T (use |)
result | result2

## [1] TRUE

# both of them are T (use &)
result & result2

## [1] FALSE

isTRUE(result)

## [1] TRUE

isTRUE(result2)

## [1] FALSE

The while loop

while (FALSE){ #logical expression
  print("Hello")
}
#-------------------------------------------
# If we change FALSE to TRUE, the loop never stops and prints "Hello" forever

# Print the current count, then advance it; stop as soon as the count is no
# longer below 12.
counter <- 1
repeat {
  if (!(counter < 12)) {
    break
  }
  print(counter)
  counter <- counter + 1
}

## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
## [1] 11

Using the console

x = 5
print(x)

## [1] 5

# type in console directly

Using the "for" loop

# a vector example
for(i in 1:5){
  print("Hello R")
}

## [1] "Hello R"
## [1] "Hello R"
## [1] "Hello R"
## [1] "Hello R"
## [1] "Hello R"

# a vector example
for(i in 5:10){
  print("Hello R")
}

## [1] "Hello R"
## [1] "Hello R"
## [1] "Hello R"
## [1] "Hello R"
## [1] "Hello R"
## [1] "Hello R"

if statements

# number line: ---- -2 ---- -1 ---- 0 ---- 1 ---- 2 ----

x = rnorm(1) # generate random numbers
if(x > 1){
  answer = "Greater than one"
}

# Classify one standard-normal draw as above 1, within [-1, 1], or below -1.
x = rnorm(1) # generate one random number
if(x > 1){
  answer = "Greater than one"
} else{
  # We only get here when x <= 1, so the middle band must be tested against
  # its lower bound -1. Testing x >= 1 here could only ever match x == 1 and
  # sent every other value to the "less than -1" branch.
  if(x >= -1){
    answer = "Between -1 and 1"
  } else{
    answer = "less than -1"
  }
}
print(answer)

## [1] "less than -1"

# Same three-way classification as a flat else-if chain.
x = rnorm(1) # generate one random number
if(x > 1){
  answer = "Greater than one"
} else if(x >= -1){
  answer = "Between -1 and 1"
} else{
  # Only reached when x < -1, so the label must say -1 (it used to say 1).
  answer = "less than -1"
}
print(answer)

## [1] "Greater than one"

Law of large numbers (LLN): $\bar{X}_n \to E(X)$ as $n \to \infty$

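# rnorm(N) defaults to mean = 0, sd = 1, so most draws fall roughly within -2 to 2.
# Here mean = 10 and sd = 5 are used instead, so only a small share of the draws
# lands in (-1, 1).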
N = 1000000
counter = 0
for(i in rnorm(N, mean = 10, sd = 5)){
  if(i > -1 && i < 1){
    counter = counter + 1
    
  }
}
counter / N

## [1] 0.022288

# Estimate by simulation the share of standard-normal draws that fall
# strictly between -0.65 and 0.65.
N <- 100000
counter <- 0
for (draw in rnorm(N)) {
  inside <- draw > -0.65 && draw < 0.65
  if (inside) {
    counter <- counter + 1
  }
}
# Proportion of draws inside the interval
answer <- counter / N
answer

## [1] 0.48316

3 fundamentals of R

create some vectors

MyFirstVector = c(3, 45, 56, 732)
print(MyFirstVector)

## [1]   3  45  56 732

is.numeric(MyFirstVector)

## [1] TRUE

is.integer(MyFirstVector)

## [1] FALSE

is.double(MyFirstVector)

## [1] TRUE

V2 = c(3L, 12L, 243L, 0L)
print(V2)

## [1]   3  12 243   0

is.numeric(V2)

## [1] TRUE

is.integer(V2)

## [1] TRUE

is.double(V2)

## [1] FALSE

V3 = c("a","B23","Hello")
print(V3)

## [1] "a"     "B23"   "Hello"

is.numeric(V3)

## [1] FALSE

is.integer(V3)

## [1] FALSE

is.double(V3)

## [1] FALSE

V4 = c("a","B23","Hello", 6)
print(V4)

## [1] "a"     "B23"   "Hello" "6"

is.character(V4)

## [1] TRUE

is.numeric(V4)

## [1] FALSE

is.integer(V4)

## [1] FALSE

is.double(V4)

## [1] FALSE

#seq() #sequence
#rep() replicate
seq(1,15)

##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15

seq(1,15,3)

## [1]  1  4  7 10 13

z = seq(1,20, 5)
print(z)

## [1]  1  6 11 16

d = rep(3, 10)
rep("bien", 3)

## [1] "bien" "bien" "bien"

x = c(80,20)
y = rep(x, 4)
y

## [1] 80 20 80 20 80 20 80 20

Using the [] brackets

x = c(1, 123, 534, 12, 4) #combine
y = seq(200, 250, 11)     #sequence
z = rep("Hi", 3)          #replicate

w = c("a", "b", "c", "d", "e")
print(w)

## [1] "a" "b" "c" "d" "e"

w[1]

## [1] "a"

w[2]

## [1] "b"

w[-1]

## [1] "b" "c" "d" "e"

w[-3]

## [1] "a" "b" "d" "e"

w[1:3]

## [1] "a" "b" "c"

w[4:5]

## [1] "d" "e"

w[c(1, 3, 5)]

## [1] "a" "c" "e"

w[c(-2, -4)]

## [1] "a" "c" "e"

w[-3: -5]

## [1] "a" "b"

w[1:2]

## [1] "a" "b"

Vectorized operation

a = c(2, 3, 4,5)
b = c(23, 12, 34, 44)
a + b

## [1] 25 15 38 49

a * b

## [1]  46  36 136 220

a / b

## [1] 0.08695652 0.25000000 0.11764706 0.11363636

a - b

## [1] -21  -9 -30 -39

The power of vectorized operations

x = rnorm(5)
x

## [1] -1.1468314  0.6508734 -1.4070737  0.3350163 -0.1044050

# R specific programming loop
for(i in x){
  print(i)
}

## [1] -1.146831
## [1] 0.6508734
## [1] -1.407074
## [1] 0.3350163
## [1] -0.104405

# like a monkey? use loop
print(x[1])

## [1] -1.146831

print(x[2])

## [1] 0.6508734

print(x[3])

## [1] -1.407074

print(x[4])

## [1] 0.3350163

print(x[5])

## [1] -0.104405

#loop
#conventional programming loop: step through the index positions of x
#(seq_along(x) follows the length of x instead of hard-coding 5)
for(j in seq_along(x)){
  print(x[j])
}

## [1] -1.146831
## [1] 0.6508734
## [1] -1.407074
## [1] 0.3350163
## [1] -0.104405

#next part
N = 100
a = rnorm(N)
b = rnorm(N)

#Vectorized approach, two doubles, just multiply
c = a * b

#De-vectorized, a bit slow, why? it is delegating, what is passing, what is needed to be down.
d = rep(NA, N)
for( i in 1:N){
  d[i] = a[i] * b[i]
}

Functions in R

#rnorm()
#c()
#seq()
#rep()
#print()
#is.numeric()
#is.integer()
#is.character()
#typeof()
#sqrt()
#paste()

#?
round(rnorm(5, mean = 10, sd = 3), 2)

## [1]  6.52 10.48  8.14  9.06 13.14

round(seq(from = 10, to = 20, length.out = 20), 2)

##  [1] 10.00 10.53 11.05 11.58 12.11 12.63 13.16 13.68 14.21 14.74 15.26 15.79
## [13] 16.32 16.84 17.37 17.89 18.42 18.95 19.47 20.00

x = c("a", "b", "c")
round(seq(from = 10, to = 20, along.with = x), 2)

## [1] 10 15 20

rep(5:6, each = 5)

##  [1] 5 5 5 5 5 6 6 6 6 6

rep(x, each = 2)

## [1] "a" "a" "b" "b" "c" "c"

rep(x, times = 2)

## [1] "a" "b" "c" "a" "b" "c"

A = round(seq(from = 10, to = 20, along.with = x), 2)
B = sqrt(A)
print(B)

## [1] 3.162278 3.872983 4.472136

Packages in R

library(ggplot2)
# qplot() was deprecated in ggplot2 3.4.0, so the same scatter plot is built
# with ggplot(): price against carat, coloured by clarity, one panel per
# clarity level laid out in columns.
ggplot(data = diamonds, aes(x = carat, y = price, colour = clarity)) +
  geom_point() +
  facet_grid(. ~ clarity)

Financial Statement Analysis

#================================================================
#Data: 12 monthly values each for revenue and expenses
revenue <- c(
  14574.49, 7606.46, 8611.41, 9175.41, 8058.65, 8105.44,
  11496.28, 9766.09, 10305.32, 14379.96, 10713.97, 15433.50
)
expenses <- c(
  12051.82, 5695.07, 12319.20, 12089.72, 8658.57, 840.20,
  3285.73, 5821.12, 6976.93, 16618.61, 10054.37, 3803.96
)

#Solution
#Calculate Profit As The Differences Between Revenue And Expenses
profit <- revenue - expenses
profit

##  [1]  2522.67  1911.39 -3707.79 -2914.31  -599.92  7265.24  8210.55  3944.97
##  [9]  3328.39 -2238.65   659.60 11629.54

#Calculate Tax As 30% Of Profit And Round To 2 Decimal Points
tax <- round(0.30 * profit, 2)
tax

##  [1]   756.80   573.42 -1112.34  -874.29  -179.98  2179.57  2463.17  1183.49
##  [9]   998.52  -671.60   197.88  3488.86

#Calculate Profit Remaining After Tax Is Deducted
profit.after.tax <- profit - tax
profit.after.tax

##  [1]  1765.87  1337.97 -2595.45 -2040.02  -419.94  5085.67  5747.38  2761.48
##  [9]  2329.87 -1567.05   461.72  8140.68

#Calculate The Profit Margin As Profit After Tax Over Revenue
#Round To 2 Decimal Points, Then Multiply By 100 To Get %
profit.margin <- round(profit.after.tax / revenue, 2) * 100
profit.margin

##  [1]  12  18 -30 -22  -5  63  50  28  23 -11   4  53

#Calculate The Mean Profit After Tax For The 12 Months
mean_pat <- mean(profit.after.tax)
mean_pat

## [1] 1750.682

#Find The Months With Above-Mean Profit After Tax
good.months <- profit.after.tax > mean_pat
good.months

##  [1]  TRUE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE FALSE FALSE  TRUE

#Bad Months Are The Opposite Of Good Months !
bad.months <- !good.months
bad.months

##  [1] FALSE  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE

#The Best Month Is Where Profit After Tax Was Equal To The Maximum
best.month <- profit.after.tax == max(profit.after.tax)
best.month

##  [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE

#The Worst Month Is Where Profit After Tax Was Equal To The Minimum
worst.month <- profit.after.tax == min(profit.after.tax)
worst.month

##  [1] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE

#Convert All Calculations To Units Of One Thousand Dollars
revenue.1000 <- round(revenue / 1000, 0)
expenses.1000 <- round(expenses / 1000, 0)
profit.1000 <- round(profit / 1000, 0)
profit.after.tax.1000 <- round(profit.after.tax / 1000, 0)

#Print Results
revenue.1000

##  [1] 15  8  9  9  8  8 11 10 10 14 11 15

expenses.1000

##  [1] 12  6 12 12  9  1  3  6  7 17 10  4

profit.1000

##  [1]  3  2 -4 -3 -1  7  8  4  3 -2  1 12

profit.after.tax.1000

##  [1]  2  1 -3 -2  0  5  6  3  2 -2  0  8

profit.margin

##  [1]  12  18 -30 -22  -5  63  50  28  23 -11   4  53

good.months

##  [1]  TRUE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE FALSE FALSE  TRUE

bad.months

##  [1] FALSE  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE

best.month

##  [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE

worst.month

##  [1] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE

#BONUS:
#Preview Of What's Coming In The Next Section
M <- rbind(
  revenue.1000,
  expenses.1000,
  profit.1000,
  profit.after.tax.1000,
  profit.margin,
  good.months,
  bad.months,
  best.month,
  worst.month
)

#Print The Matrix
M

##                       [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11]
## revenue.1000            15    8    9    9    8    8   11   10   10    14    11
## expenses.1000           12    6   12   12    9    1    3    6    7    17    10
## profit.1000              3    2   -4   -3   -1    7    8    4    3    -2     1
## profit.after.tax.1000    2    1   -3   -2    0    5    6    3    2    -2     0
## profit.margin           12   18  -30  -22   -5   63   50   28   23   -11     4
## good.months              1    0    0    0    0    1    1    1    1     0     0
## bad.months               0    1    1    1    1    0    0    0    0     1     1
## best.month               0    0    0    0    0    0    0    0    0     0     0
## worst.month              0    0    1    0    0    0    0    0    0     0     0
##                       [,12]
## revenue.1000             15
## expenses.1000             4
## profit.1000              12
## profit.after.tax.1000     8
## profit.margin            53
## good.months               1
## bad.months                0
## best.month                1
## worst.month               0

4 Matrices

Project Brief: Basketball Trends. Data and resources come from (https://www.superdatascience.com/pages/rcourse)

# preparation
#Dear Student,
#
#Welcome to the world of Basketball Data!
#I'm sure you will enjoy this section of the R Programming course.
#
#Instructions for this dataset:
# Simply select ALL the lines in this script by pressing 
# CTRL+A on Windows or CMND+A on a Mac and execute them
# Once you have executed the commands the following objects
# will be created:
# Matrices:
# - FieldGoalAttempts
# - FieldGoals
# - Games
# - MinutesPlayed
# - Salary
# Vectors:
# - Players
# - Seasons
#We will go through and understand these inside the course.
#
#Sincerely,
#Kirill Eremenko
#www.superdatascience.com

#Copyright: These datasets were prepared using publicly available data.
#           However, these scripts are subject to Copyright Laws.
#           If you wish to use these R scripts outside of the R Programming Course
#           by Kirill Eremenko, you may do so by referencing www.superdatascience.com in your work.

#Comments:
#Seasons are labeled based on the first year in the season
#E.g. the 2012-2013 season is presented as simply 2012

#Notes and Corrections to the data:
#Kevin Durant: 2006 - College Data Used
#Kevin Durant: 2005 - Proxied With 2006 Data
#Derrick Rose: 2012 - Did Not Play
#Derrick Rose: 2007 - College Data Used
#Derrick Rose: 2006 - Proxied With 2007 Data
#Derrick Rose: 2005 - Proxied With 2007 Data

#Seasons
Seasons <- c("2005","2006","2007","2008","2009","2010","2011","2012","2013","2014")

#Players
Players <- c("KobeBryant","JoeJohnson","LeBronJames","CarmeloAnthony","DwightHoward","ChrisBosh","ChrisPaul","KevinDurant","DerrickRose","DwayneWade")

#Salaries
KobeBryant_Salary <- c(15946875,17718750,19490625,21262500,23034375,24806250,25244493,27849149,30453805,23500000)
JoeJohnson_Salary <- c(12000000,12744189,13488377,14232567,14976754,16324500,18038573,19752645,21466718,23180790)
LeBronJames_Salary <- c(4621800,5828090,13041250,14410581,15779912,14500000,16022500,17545000,19067500,20644400)
CarmeloAnthony_Salary <- c(3713640,4694041,13041250,14410581,15779912,17149243,18518574,19450000,22407474,22458000)
DwightHoward_Salary <- c(4493160,4806720,6061274,13758000,15202590,16647180,18091770,19536360,20513178,21436271)
ChrisBosh_Salary <- c(3348000,4235220,12455000,14410581,15779912,14500000,16022500,17545000,19067500,20644400)
ChrisPaul_Salary <- c(3144240,3380160,3615960,4574189,13520500,14940153,16359805,17779458,18668431,20068563)
KevinDurant_Salary <- c(0,0,4171200,4484040,4796880,6053663,15506632,16669630,17832627,18995624)
DerrickRose_Salary <- c(0,0,0,4822800,5184480,5546160,6993708,16402500,17632688,18862875)
DwayneWade_Salary <- c(3031920,3841443,13041250,14410581,15779912,14200000,15691000,17182000,18673000,15000000)
#Matrix
Salary <- rbind(KobeBryant_Salary, JoeJohnson_Salary, LeBronJames_Salary, CarmeloAnthony_Salary, DwightHoward_Salary, ChrisBosh_Salary, ChrisPaul_Salary, KevinDurant_Salary, DerrickRose_Salary, DwayneWade_Salary)
rm(KobeBryant_Salary, JoeJohnson_Salary, CarmeloAnthony_Salary, DwightHoward_Salary, ChrisBosh_Salary, LeBronJames_Salary, ChrisPaul_Salary, DerrickRose_Salary, DwayneWade_Salary, KevinDurant_Salary)
colnames(Salary) <- Seasons
rownames(Salary) <- Players

#Games 
KobeBryant_G <- c(80,77,82,82,73,82,58,78,6,35)
JoeJohnson_G <- c(82,57,82,79,76,72,60,72,79,80)
LeBronJames_G <- c(79,78,75,81,76,79,62,76,77,69)
CarmeloAnthony_G <- c(80,65,77,66,69,77,55,67,77,40)
DwightHoward_G <- c(82,82,82,79,82,78,54,76,71,41)
ChrisBosh_G <- c(70,69,67,77,70,77,57,74,79,44)
ChrisPaul_G <- c(78,64,80,78,45,80,60,70,62,82)
KevinDurant_G <- c(35,35,80,74,82,78,66,81,81,27)
DerrickRose_G <- c(40,40,40,81,78,81,39,0,10,51)
DwayneWade_G <- c(75,51,51,79,77,76,49,69,54,62)
#Matrix
Games <- rbind(KobeBryant_G, JoeJohnson_G, LeBronJames_G, CarmeloAnthony_G, DwightHoward_G, ChrisBosh_G, ChrisPaul_G, KevinDurant_G, DerrickRose_G, DwayneWade_G)
rm(KobeBryant_G, JoeJohnson_G, CarmeloAnthony_G, DwightHoward_G, ChrisBosh_G, LeBronJames_G, ChrisPaul_G, DerrickRose_G, DwayneWade_G, KevinDurant_G)
colnames(Games) <- Seasons
rownames(Games) <- Players

#Minutes Played
KobeBryant_MP <- c(3277,3140,3192,2960,2835,2779,2232,3013,177,1207)
JoeJohnson_MP <- c(3340,2359,3343,3124,2886,2554,2127,2642,2575,2791)
LeBronJames_MP <- c(3361,3190,3027,3054,2966,3063,2326,2877,2902,2493)
CarmeloAnthony_MP <- c(2941,2486,2806,2277,2634,2751,1876,2482,2982,1428)
DwightHoward_MP <- c(3021,3023,3088,2821,2843,2935,2070,2722,2396,1223)
ChrisBosh_MP <- c(2751,2658,2425,2928,2526,2795,2007,2454,2531,1556)
ChrisPaul_MP <- c(2808,2353,3006,3002,1712,2880,2181,2335,2171,2857)
KevinDurant_MP <- c(1255,1255,2768,2885,3239,3038,2546,3119,3122,913)
DerrickRose_MP <- c(1168,1168,1168,3000,2871,3026,1375,0,311,1530)
DwayneWade_MP <- c(2892,1931,1954,3048,2792,2823,1625,2391,1775,1971)
#Matrix
MinutesPlayed <- rbind(KobeBryant_MP, JoeJohnson_MP, LeBronJames_MP, CarmeloAnthony_MP, DwightHoward_MP, ChrisBosh_MP, ChrisPaul_MP, KevinDurant_MP, DerrickRose_MP, DwayneWade_MP)
rm(KobeBryant_MP, JoeJohnson_MP, CarmeloAnthony_MP, DwightHoward_MP, ChrisBosh_MP, LeBronJames_MP, ChrisPaul_MP, DerrickRose_MP, DwayneWade_MP, KevinDurant_MP)
colnames(MinutesPlayed) <- Seasons
rownames(MinutesPlayed) <- Players

#Field Goals
KobeBryant_FG <- c(978,813,775,800,716,740,574,738,31,266)
JoeJohnson_FG <- c(632,536,647,620,635,514,423,445,462,446)
LeBronJames_FG <- c(875,772,794,789,768,758,621,765,767,624)
CarmeloAnthony_FG <- c(756,691,728,535,688,684,441,669,743,358)
DwightHoward_FG <- c(468,526,583,560,510,619,416,470,473,251)
ChrisBosh_FG <- c(549,543,507,615,600,524,393,485,492,343)
ChrisPaul_FG <- c(407,381,630,631,314,430,425,412,406,568)
KevinDurant_FG <- c(306,306,587,661,794,711,643,731,849,238)
DerrickRose_FG <- c(208,208,208,574,672,711,302,0,58,338)
DwayneWade_FG <- c(699,472,439,854,719,692,416,569,415,509)
#Matrix
FieldGoals <- rbind(KobeBryant_FG, JoeJohnson_FG, LeBronJames_FG, CarmeloAnthony_FG, DwightHoward_FG, ChrisBosh_FG, ChrisPaul_FG, KevinDurant_FG, DerrickRose_FG, DwayneWade_FG)
rm(KobeBryant_FG, JoeJohnson_FG, LeBronJames_FG, CarmeloAnthony_FG, DwightHoward_FG, ChrisBosh_FG, ChrisPaul_FG, KevinDurant_FG, DerrickRose_FG, DwayneWade_FG)
colnames(FieldGoals) <- Seasons
rownames(FieldGoals) <- Players

#Field Goal Attempts
KobeBryant_FGA <- c(2173,1757,1690,1712,1569,1639,1336,1595,73,713)
JoeJohnson_FGA <- c(1395,1139,1497,1420,1386,1161,931,1052,1018,1025)
LeBronJames_FGA <- c(1823,1621,1642,1613,1528,1485,1169,1354,1353,1279)
CarmeloAnthony_FGA <- c(1572,1453,1481,1207,1502,1503,1025,1489,1643,806)
DwightHoward_FGA <- c(881,873,974,979,834,1044,726,813,800,423)
ChrisBosh_FGA <- c(1087,1094,1027,1263,1158,1056,807,907,953,745)
ChrisPaul_FGA <- c(947,871,1291,1255,637,928,890,856,870,1170)
KevinDurant_FGA <- c(647,647,1366,1390,1668,1538,1297,1433,1688,467)
DerrickRose_FGA <- c(436,436,436,1208,1373,1597,695,0,164,835)
DwayneWade_FGA <- c(1413,962,937,1739,1511,1384,837,1093,761,1084)
#Matrix
FieldGoalAttempts <- rbind(KobeBryant_FGA, JoeJohnson_FGA, LeBronJames_FGA, CarmeloAnthony_FGA, DwightHoward_FGA, ChrisBosh_FGA, ChrisPaul_FGA, KevinDurant_FGA, DerrickRose_FGA, DwayneWade_FGA)
rm(KobeBryant_FGA, JoeJohnson_FGA, LeBronJames_FGA, CarmeloAnthony_FGA, DwightHoward_FGA, ChrisBosh_FGA, ChrisPaul_FGA, KevinDurant_FGA, DerrickRose_FGA, DwayneWade_FGA)
colnames(FieldGoalAttempts) <- Seasons
rownames(FieldGoalAttempts) <- Players

#Points
KobeBryant_PTS <- c(2832,2430,2323,2201,1970,2078,1616,2133,83,782)
JoeJohnson_PTS <- c(1653,1426,1779,1688,1619,1312,1129,1170,1245,1154)
LeBronJames_PTS <- c(2478,2132,2250,2304,2258,2111,1683,2036,2089,1743)
CarmeloAnthony_PTS <- c(2122,1881,1978,1504,1943,1970,1245,1920,2112,966)
DwightHoward_PTS <- c(1292,1443,1695,1624,1503,1784,1113,1296,1297,646)
ChrisBosh_PTS <- c(1572,1561,1496,1746,1678,1438,1025,1232,1281,928)
ChrisPaul_PTS <- c(1258,1104,1684,1781,841,1268,1189,1186,1185,1564)
KevinDurant_PTS <- c(903,903,1624,1871,2472,2161,1850,2280,2593,686)
DerrickRose_PTS <- c(597,597,597,1361,1619,2026,852,0,159,904)
DwayneWade_PTS <- c(2040,1397,1254,2386,2045,1941,1082,1463,1028,1331)
#Matrix
Points <- rbind(KobeBryant_PTS, JoeJohnson_PTS, LeBronJames_PTS, CarmeloAnthony_PTS, DwightHoward_PTS, ChrisBosh_PTS, ChrisPaul_PTS, KevinDurant_PTS, DerrickRose_PTS, DwayneWade_PTS)
rm(KobeBryant_PTS, JoeJohnson_PTS, LeBronJames_PTS, CarmeloAnthony_PTS, DwightHoward_PTS, ChrisBosh_PTS, ChrisPaul_PTS, KevinDurant_PTS, DerrickRose_PTS, DwayneWade_PTS)
colnames(Points) <- Seasons
rownames(Points) <- Players

continue

Salary

##                    2005     2006     2007     2008     2009     2010     2011
## KobeBryant     15946875 17718750 19490625 21262500 23034375 24806250 25244493
## JoeJohnson     12000000 12744189 13488377 14232567 14976754 16324500 18038573
## LeBronJames     4621800  5828090 13041250 14410581 15779912 14500000 16022500
## CarmeloAnthony  3713640  4694041 13041250 14410581 15779912 17149243 18518574
## DwightHoward    4493160  4806720  6061274 13758000 15202590 16647180 18091770
## ChrisBosh       3348000  4235220 12455000 14410581 15779912 14500000 16022500
## ChrisPaul       3144240  3380160  3615960  4574189 13520500 14940153 16359805
## KevinDurant           0        0  4171200  4484040  4796880  6053663 15506632
## DerrickRose           0        0        0  4822800  5184480  5546160  6993708
## DwayneWade      3031920  3841443 13041250 14410581 15779912 14200000 15691000
##                    2012     2013     2014
## KobeBryant     27849149 30453805 23500000
## JoeJohnson     19752645 21466718 23180790
## LeBronJames    17545000 19067500 20644400
## CarmeloAnthony 19450000 22407474 22458000
## DwightHoward   19536360 20513178 21436271
## ChrisBosh      17545000 19067500 20644400
## ChrisPaul      17779458 18668431 20068563
## KevinDurant    16669630 17832627 18995624
## DerrickRose    16402500 17632688 18862875
## DwayneWade     17182000 18673000 15000000

Matrices

# A[3,4] to locate the target in matrix

Building Your First Matrix

#matrix(), cbind(), rbind()
my.data = 1:20
# matrix() fills column by column unless told otherwise
A = matrix(my.data, nrow = 4, ncol = 5)
A

##      [,1] [,2] [,3] [,4] [,5]
## [1,]    1    5    9   13   17
## [2,]    2    6   10   14   18
## [3,]    3    7   11   15   19
## [4,]    4    8   12   16   20

A[2,3]

## [1] 10

# byrow = TRUE fills row by row instead (TRUE is spelled out because T is an
# ordinary variable that can be reassigned)
B = matrix(my.data, nrow = 4, ncol = 5, byrow = TRUE)
B

##      [,1] [,2] [,3] [,4] [,5]
## [1,]    1    2    3    4    5
## [2,]    6    7    8    9   10
## [3,]   11   12   13   14   15
## [4,]   16   17   18   19   20

B[2,5]

## [1] 10

#rbind
r1 = c('I', 'am', 'happy')
r2 = c('What', 'a', 'day')
r3 = c(1,2,3)
C = rbind(r1, r2, r3)
C

##    [,1]   [,2] [,3]   
## r1 "I"    "am" "happy"
## r2 "What" "a"  "day"  
## r3 "1"    "2"  "3"

c1 = 1:5
c2 = -1:-5
D = cbind(c1,c2)
D

##      c1 c2
## [1,]  1 -1
## [2,]  2 -2
## [3,]  3 -3
## [4,]  4 -4
## [5,]  5 -5

Naming Dimensions

# rownames() colnames(). V['rowname','colname']?

Colnames() and Rownames()

# name vectors
Charlie = 1: 5
Charlie

## [1] 1 2 3 4 5

#give name
names(Charlie) #return NULL

## NULL

names(Charlie) = c('a','b','c','d','e')
Charlie

## a b c d e 
## 1 2 3 4 5

Charlie['d']

## d 
## 4

names(Charlie)

## [1] "a" "b" "c" "d" "e"

#clear names
names(Charlie) = NULL
#=========================================
#naming matrix dimensions 1

rep(c('a','b','zZ'),times = 3)

## [1] "a"  "b"  "zZ" "a"  "b"  "zZ" "a"  "b"  "zZ"

temp.vec = rep(c('a','b','zZ'),each = 3)
temp.vec

## [1] "a"  "a"  "a"  "b"  "b"  "b"  "zZ" "zZ" "zZ"

Bravo = matrix(temp.vec, 3,3)
Bravo

##      [,1] [,2] [,3]
## [1,] "a"  "b"  "zZ"
## [2,] "a"  "b"  "zZ"
## [3,] "a"  "b"  "zZ"

rownames(Bravo)# NULL

## NULL

rownames(Bravo) = c('How','are','you')
colnames(Bravo) = c('X','Y','Z')
Bravo

##     X   Y   Z   
## How "a" "b" "zZ"
## are "a" "b" "zZ"
## you "a" "b" "zZ"

#----------------
Bravo['are','Y']

## [1] "b"

#change number
Bravo['are','Y'] = 0
Bravo

##     X   Y   Z   
## How "a" "b" "zZ"
## are "a" "0" "zZ"
## you "a" "b" "zZ"

Matrix Operations

#basketball data
Games

##                2005 2006 2007 2008 2009 2010 2011 2012 2013 2014
## KobeBryant       80   77   82   82   73   82   58   78    6   35
## JoeJohnson       82   57   82   79   76   72   60   72   79   80
## LeBronJames      79   78   75   81   76   79   62   76   77   69
## CarmeloAnthony   80   65   77   66   69   77   55   67   77   40
## DwightHoward     82   82   82   79   82   78   54   76   71   41
## ChrisBosh        70   69   67   77   70   77   57   74   79   44
## ChrisPaul        78   64   80   78   45   80   60   70   62   82
## KevinDurant      35   35   80   74   82   78   66   81   81   27
## DerrickRose      40   40   40   81   78   81   39    0   10   51
## DwayneWade       75   51   51   79   77   76   49   69   54   62

rownames(Games)

##  [1] "KobeBryant"     "JoeJohnson"     "LeBronJames"    "CarmeloAnthony"
##  [5] "DwightHoward"   "ChrisBosh"      "ChrisPaul"      "KevinDurant"   
##  [9] "DerrickRose"    "DwayneWade"

colnames(Games)

##  [1] "2005" "2006" "2007" "2008" "2009" "2010" "2011" "2012" "2013" "2014"

Games['LeBronJames','2012']

## [1] 76

round(FieldGoals / Games, 1)

##                2005 2006 2007 2008 2009 2010 2011 2012 2013 2014
## KobeBryant     12.2 10.6  9.5  9.8  9.8  9.0  9.9  9.5  5.2  7.6
## JoeJohnson      7.7  9.4  7.9  7.8  8.4  7.1  7.0  6.2  5.8  5.6
## LeBronJames    11.1  9.9 10.6  9.7 10.1  9.6 10.0 10.1 10.0  9.0
## CarmeloAnthony  9.4 10.6  9.5  8.1 10.0  8.9  8.0 10.0  9.6  8.9
## DwightHoward    5.7  6.4  7.1  7.1  6.2  7.9  7.7  6.2  6.7  6.1
## ChrisBosh       7.8  7.9  7.6  8.0  8.6  6.8  6.9  6.6  6.2  7.8
## ChrisPaul       5.2  6.0  7.9  8.1  7.0  5.4  7.1  5.9  6.5  6.9
## KevinDurant     8.7  8.7  7.3  8.9  9.7  9.1  9.7  9.0 10.5  8.8
## DerrickRose     5.2  5.2  5.2  7.1  8.6  8.8  7.7  NaN  5.8  6.6
## DwayneWade      9.3  9.3  8.6 10.8  9.3  9.1  8.5  8.2  7.7  8.2

round(MinutesPlayed / Games)

##                2005 2006 2007 2008 2009 2010 2011 2012 2013 2014
## KobeBryant       41   41   39   36   39   34   38   39   30   34
## JoeJohnson       41   41   41   40   38   35   35   37   33   35
## LeBronJames      43   41   40   38   39   39   38   38   38   36
## CarmeloAnthony   37   38   36   34   38   36   34   37   39   36
## DwightHoward     37   37   38   36   35   38   38   36   34   30
## ChrisBosh        39   39   36   38   36   36   35   33   32   35
## ChrisPaul        36   37   38   38   38   36   36   33   35   35
## KevinDurant      36   36   35   39   40   39   39   39   39   34
## DerrickRose      29   29   29   37   37   37   35  NaN   31   30
## DwayneWade       39   38   38   39   36   37   33   35   33   32

Visualizing With Matplot()

t(FieldGoals) # flip table

##      KobeBryant JoeJohnson LeBronJames CarmeloAnthony DwightHoward ChrisBosh
## 2005        978        632         875            756          468       549
## 2006        813        536         772            691          526       543
## 2007        775        647         794            728          583       507
## 2008        800        620         789            535          560       615
## 2009        716        635         768            688          510       600
## 2010        740        514         758            684          619       524
## 2011        574        423         621            441          416       393
## 2012        738        445         765            669          470       485
## 2013         31        462         767            743          473       492
## 2014        266        446         624            358          251       343
##      ChrisPaul KevinDurant DerrickRose DwayneWade
## 2005       407         306         208        699
## 2006       381         306         208        472
## 2007       630         587         208        439
## 2008       631         661         574        854
## 2009       314         794         672        719
## 2010       430         711         711        692
## 2011       425         643         302        416
## 2012       412         731           0        569
## 2013       406         849          58        415
## 2014       568         238         338        509

# matplot() draws one line per column, so the player-by-season matrix is
# transposed to get one line per player.
matplot(t(Salary), type = 'b', pch = 15:18, col = c(1:4, 6))
# The legend reuses the same pch/col vectors so each entry matches its line.
legend('bottomleft', inset = 0.01, legend = Players, col = c(1:4, 6), pch = 15:18, horiz = FALSE)

Subsetting

x = c('a','b','c','d','e')
x[c(1,5)]

## [1] "a" "e"

x[1]

## [1] "a"

#-----------
Games

##                2005 2006 2007 2008 2009 2010 2011 2012 2013 2014
## KobeBryant       80   77   82   82   73   82   58   78    6   35
## JoeJohnson       82   57   82   79   76   72   60   72   79   80
## LeBronJames      79   78   75   81   76   79   62   76   77   69
## CarmeloAnthony   80   65   77   66   69   77   55   67   77   40
## DwightHoward     82   82   82   79   82   78   54   76   71   41
## ChrisBosh        70   69   67   77   70   77   57   74   79   44
## ChrisPaul        78   64   80   78   45   80   60   70   62   82
## KevinDurant      35   35   80   74   82   78   66   81   81   27
## DerrickRose      40   40   40   81   78   81   39    0   10   51
## DwayneWade       75   51   51   79   77   76   49   69   54   62

Games[1:3, 6:10]

##             2010 2011 2012 2013 2014
## KobeBryant    82   58   78    6   35
## JoeJohnson    72   60   72   79   80
## LeBronJames   79   62   76   77   69

# see a specific person
Games[c(1,10),]

##            2005 2006 2007 2008 2009 2010 2011 2012 2013 2014
## KobeBryant   80   77   82   82   73   82   58   78    6   35
## DwayneWade   75   51   51   79   77   76   49   69   54   62

Games[,c('2008','2009')]

##                2008 2009
## KobeBryant       82   73
## JoeJohnson       79   76
## LeBronJames      81   76
## CarmeloAnthony   66   69
## DwightHoward     79   82
## ChrisBosh        77   70
## ChrisPaul        78   45
## KevinDurant      74   82
## DerrickRose      81   78
## DwayneWade       79   77

Games[1,] # no names

## 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 
##   80   77   82   82   73   82   58   78    6   35

is.matrix(Games[1,])

## [1] FALSE

is.vector(Games[1,])

## [1] TRUE

Games[1,5]

## [1] 73

Games[1,,drop = FALSE] # drop = FALSE keeps the single row as a 1-row matrix instead of a vector

##            2005 2006 2007 2008 2009 2010 2011 2012 2013 2014
## KobeBryant   80   77   82   82   73   82   58   78    6   35

Visualize subsetting

#subset to just see the first three players
Data = MinutesPlayed[1:3,]

matplot(t(Data), type ='b', pch = 15: 18, col=c(1:4, 6))
legend('bottomleft', inset = 0.01, legend = Players[1:3], col = c(1:4, 6), pch = 15:18, horiz = FALSE)

#subset to just see a player
#drop = FALSE keeps the single row as a 1-row matrix so t() and matplot() still get a matrix
data2 = MinutesPlayed[1,,drop = FALSE]

matplot(t(data2), type ='b', pch = 15: 18, col=c(1:4, 6))
legend('bottomleft', inset = 0.01, legend = Players[1], col = c(1:4, 6), pch = 15:18, horiz = FALSE)

Creating Your First Function

#how to create function for the code below?
matplot(t(data2), type ='b', pch = 15: 18, col=c(1:4, 6))
legend('bottomleft', inset = 0.01, legend = Players[1], col = c(1:4, 6), pch = 15:18, horiz = FALSE)

#---------

# Draw one line per selected row of a matrix, with a matching legend.
#
# NOTE: this function is named `plot`, so while it exists in the workspace it
# masks the usual plot() function for calls made from the workspace.
#
# Arguments:
#   data: a matrix with one row per player (e.g. Salary, Games, or a ratio
#         of two such matrices).
#   rows: which rows to draw; defaults to every row of `data`.
#
# Legend labels are taken from the global vector `Players`, indexed by `rows`.
plot = function(data, rows = seq_len(nrow(data))){
  # drop = FALSE keeps a single selected row as a 1-row matrix, so t() and
  # matplot() still receive a matrix
  data2 = data[rows,,drop = FALSE]
  # matplot() draws one line per column, hence the transpose
  matplot(t(data2), type ='b', pch = 15: 18, col=c(1:4, 6))
  legend('bottomleft', inset = 0.01, legend = Players[rows], col = c(1:4, 6), pch = 15:18, horiz = FALSE)
}

#type data and rows below
plot(MinutesPlayed/Games, 1)

Basketball Insights

#salary
plot(Salary)

plot(Salary / Games)

plot(Salary / FieldGoals)

# In-game metrics
plot(MinutesPlayed)

plot(Points)

# In-game metrics normalized
plot(FieldGoals / Games)

plot(FieldGoals / FieldGoalAttempts)

plot(FieldGoalAttempts/Games)

plot(Points/Games)

# interesting observation
plot(MinutesPlayed/Games)

plot(Games)

# time is valuable
plot(FieldGoals/MinutesPlayed)

#player style
plot(Points/FieldGoals)

Homework chapter 4

#Dear Student,
#
#Welcome to the dataset for the homework exercise.
#
#Instructions for this dataset:
# You have only been supplied vectors. You will need
# to create the matrices yourself.
# Matrices:
# - FreeThrows
# - FreeThrowAttempts
#
#Sincerely,
#Kirill Eremenko
#www.superdatascience.com

#Copyright: These datasets were prepared using publicly available data.
#           However, these scripts are subject to Copyright Laws.
#           If you wish to use these R scripts outside of the R Programming Course
#           by Kirill Eremenko, you may do so by referencing www.superdatascience.com in your work.

#Comments:
#Seasons are labeled based on the first year in the season
#E.g. the 2012-2013 season is presented as simply 2012

#Notes and Corrections to the data:
#Kevin Durant: 2006 - College Data Used
#Kevin Durant: 2005 - Proxied With 2006 Data
#Derrick Rose: 2012 - Did Not Play
#Derrick Rose: 2007 - College Data Used
#Derrick Rose: 2006 - Proxied With 2007 Data
#Derrick Rose: 2005 - Proxied With 2007 Data

#Seasons
Seasons <- c("2005","2006","2007","2008","2009","2010","2011","2012","2013","2014")

#Players
Players <- c("KobeBryant","JoeJohnson","LeBronJames","CarmeloAnthony","DwightHoward","ChrisBosh","ChrisPaul","KevinDurant","DerrickRose","DwayneWade")

#Free Throws
KobeBryant_FT <- c(696,667,623,483,439,483,381,525,18,196)
JoeJohnson_FT <- c(261,235,316,299,220,195,158,132,159,141)
LeBronJames_FT <- c(601,489,549,594,593,503,387,403,439,375)
CarmeloAnthony_FT <- c(573,459,464,371,508,507,295,425,459,189)
DwightHoward_FT <- c(356,390,529,504,483,546,281,355,349,143)
ChrisBosh_FT <- c(474,463,472,504,470,384,229,241,223,179)
ChrisPaul_FT <- c(394,292,332,455,161,337,260,286,295,289)
KevinDurant_FT <- c(209,209,391,452,756,594,431,679,703,146)
DerrickRose_FT <- c(146,146,146,197,259,476,194,0,27,152)
DwayneWade_FT <- c(629,432,354,590,534,494,235,308,189,284)
#Matrix
#
# <put your code here>
#

#Free Throw Attempts
KobeBryant_FTA <- c(819,768,742,564,541,583,451,626,21,241)
JoeJohnson_FTA <- c(330,314,379,362,269,243,186,161,195,176)
LeBronJames_FTA <- c(814,701,771,762,773,663,502,535,585,528)
CarmeloAnthony_FTA <- c(709,568,590,468,612,605,367,512,541,237)
DwightHoward_FTA <- c(598,666,897,849,816,916,572,721,638,271)
ChrisBosh_FTA <- c(581,590,559,617,590,471,279,302,272,232)
ChrisPaul_FTA <- c(465,357,390,524,190,384,302,323,345,321)
KevinDurant_FTA <- c(256,256,448,524,840,675,501,750,805,171)
DerrickRose_FTA <- c(205,205,205,250,338,555,239,0,32,187)
DwayneWade_FTA <- c(803,535,467,771,702,652,297,425,258,370)
#Matrix
#
#
#

# Free throws made: stack the per-player vectors into a matrix with
# one row per player and one column per season
FreeThrows <- rbind(
  KobeBryant_FT, JoeJohnson_FT, LeBronJames_FT, CarmeloAnthony_FT,
  DwightHoward_FT, ChrisBosh_FT, ChrisPaul_FT, KevinDurant_FT,
  DerrickRose_FT, DwayneWade_FT
)
# Label rows with the player names and columns with the seasons in one step
dimnames(FreeThrows) <- list(Players, Seasons)
# The individual <Player>_FT vectors are no longer needed - drop them
rm(list = paste0(Players, "_FT"))

# Print the matrix to check it
FreeThrows

##                2005 2006 2007 2008 2009 2010 2011 2012 2013 2014
## KobeBryant      696  667  623  483  439  483  381  525   18  196
## JoeJohnson      261  235  316  299  220  195  158  132  159  141
## LeBronJames     601  489  549  594  593  503  387  403  439  375
## CarmeloAnthony  573  459  464  371  508  507  295  425  459  189
## DwightHoward    356  390  529  504  483  546  281  355  349  143
## ChrisBosh       474  463  472  504  470  384  229  241  223  179
## ChrisPaul       394  292  332  455  161  337  260  286  295  289
## KevinDurant     209  209  391  452  756  594  431  679  703  146
## DerrickRose     146  146  146  197  259  476  194    0   27  152
## DwayneWade      629  432  354  590  534  494  235  308  189  284

# Free throw attempts: stack the per-player vectors into a matrix with
# one row per player and one column per season
FreeThrowAttempts <- rbind(
  KobeBryant_FTA, JoeJohnson_FTA, LeBronJames_FTA, CarmeloAnthony_FTA,
  DwightHoward_FTA, ChrisBosh_FTA, ChrisPaul_FTA, KevinDurant_FTA,
  DerrickRose_FTA, DwayneWade_FTA
)
# Label rows with the player names and columns with the seasons in one step
dimnames(FreeThrowAttempts) <- list(Players, Seasons)
# The individual <Player>_FTA vectors are no longer needed - drop them
rm(list = paste0(Players, "_FTA"))

# Print the matrix to check it
FreeThrowAttempts

##                2005 2006 2007 2008 2009 2010 2011 2012 2013 2014
## KobeBryant      819  768  742  564  541  583  451  626   21  241
## JoeJohnson      330  314  379  362  269  243  186  161  195  176
## LeBronJames     814  701  771  762  773  663  502  535  585  528
## CarmeloAnthony  709  568  590  468  612  605  367  512  541  237
## DwightHoward    598  666  897  849  816  916  572  721  638  271
## ChrisBosh       581  590  559  617  590  471  279  302  272  232
## ChrisPaul       465  357  390  524  190  384  302  323  345  321
## KevinDurant     256  256  448  524  840  675  501  750  805  171
## DerrickRose     205  205  205  250  338  555  239    0   32  187
## DwayneWade      803  535  467  771  702  652  297  425  258  370

#Re-create the plotting function
# Draw the selected rows of a matrix as line-and-point series (one series per
# row, one x position per column), titled "Basketball Players Analysis", with
# a legend in the bottom-left corner.
#
# Args:
#   z:   a matrix whose rows are the series to draw.
#   who: row selector (indices or row names) choosing which series to draw;
#        defaults to 1:10.
#
# Legend labels are the row names of the selected rows. When the matrix has no
# row names, the global `Players` vector indexed by `who` is used instead.
myplot <- function(z, who=1:10) {
  # drop = FALSE keeps a single selected row as a 1-row matrix so t() works
  selected <- z[who, , drop = FALSE]
  labels <- rownames(selected)
  if (is.null(labels)) {
    labels <- Players[who]
  }
  matplot(t(selected), type="b", pch=15:18, col=c(1:4,6), main="Basketball Players Analysis")
  legend("bottomleft", inset=0.01, legend=labels, col=c(1:4,6), pch=15:18, horiz=FALSE)
}

#Visualize the new matrices
myplot(FreeThrows)

myplot(FreeThrowAttempts)

#Part 1 - Free Throw Attempts Per Game 
#(You will need the Games matrix)
myplot(FreeThrowAttempts/Games)

#Notice how Chris Paul gets few attempts per game

#Part 2 - Free Throw Accuracy
myplot(FreeThrows/FreeThrowAttempts)

#And yet Chris Paul's accuracy is one of the highest
#Chances are his team would get more points if he had more FTA's
#Also notice that Dwight Howard's FT Accuracy is extremely poor
#compared to other players. If you recall, Dwight Howard's
#Field Goal Accuracy was exceptional:
myplot(FieldGoals/FieldGoalAttempts)

#How could this be? Why is there such a drastic difference?
#We will see just now...

#Part 3 - Player Style Patterns Excluding Free Throws
myplot((Points-FreeThrows)/FieldGoals)

#Because we have excluded free throws, this plot now shows us
#the true representation of player style change. We can verify
#that this is the case because all the marks without exception
#on this plot are between 2 and 3. That is because Field Goals
#can only be for either 2 points or 3 points.
#Insights:
#1. You can see how players' preference for 2 or 3 point shots
#   changes throughout their career. We can see that almost all
#   players in this dataset experiment with their style throughout
#   their careers. Perhaps, the most drastic change in style has
#   been experienced by Joe Johnson.
#2. There is one exception. You can see that one player has not
#   changed his style at all - almost always scoring only 2-pointers.
#   Who is this mystery player? It's Dwight Howard!
#   Now that explains a lot. The reason that Dwight Howard's
#   Field Goal accuracy is so good is because he almost always
#   scores 2-pointers only. That means he can be close to the basket
#   or even in contact with it. Free throws, on the other hand require
#   the player to stand 15ft (4.57m) away from the hoop. That's 
#   probably why Dwight Howard's Free Throw Accuracy is poor.

5. data frames

brief: demographic analysis

#get data from ... like before

Importing data into R

# get data
#1. select the file manually

# stats = read.csv(file.choose())
# head(stats)

#2. set WD and read data
getwd()

## [1] "/home/jupyter-yangbdm/new folder/R-program"

# Make the folder that contains the CSV the working directory.
# NOTE: this is a machine-specific, Unix-style path; adjust it for your own system.
setwd("/home/jupyter-yangbdm/new folder/R-program")
# stringsAsFactors = TRUE converts the character columns to factors on import
stats = read.csv("P2-Demographic-Data.csv", stringsAsFactors = TRUE)

Exploring your dataset

nrow(stats)

## [1] 195

ncol(stats)

## [1] 5

head(stats, n = 5)

tail(stats, n = 5)

#--------------------
str(stats) # str = structure (not "string"); likewise runif() means "random uniform", not "run if"

## 'data.frame':    195 obs. of  5 variables:
##  $ Country.Name  : Factor w/ 195 levels "Afghanistan",..: 8 1 4 2 183 6 7 5 9 10 ...
##  $ Country.Code  : Factor w/ 195 levels "ABW","AFG","AGO",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ Birth.rate    : num  10.2 35.3 46 12.9 11 ...
##  $ Internet.users: num  78.9 5.9 19.1 57.2 88 ...
##  $ Income.Group  : Factor w/ 4 levels "High income",..: 1 2 4 4 1 1 3 1 1 1 ...

summary(stats)

##               Country.Name  Country.Code   Birth.rate    Internet.users 
##  Afghanistan        :  1   ABW    :  1   Min.   : 7.90   Min.   : 0.90  
##  Albania            :  1   AFG    :  1   1st Qu.:12.12   1st Qu.:14.52  
##  Algeria            :  1   AGO    :  1   Median :19.68   Median :41.00  
##  Angola             :  1   ALB    :  1   Mean   :21.47   Mean   :42.08  
##  Antigua and Barbuda:  1   ARE    :  1   3rd Qu.:29.76   3rd Qu.:66.22  
##  Argentina          :  1   ARG    :  1   Max.   :49.66   Max.   :96.55  
##  (Other)            :189   (Other):189                                  
##               Income.Group
##  High income        :67   
##  Low income         :30   
##  Lower middle income:50   
##  Upper middle income:48   
##                           
##                           
##

Using the $ sign

head(stats)

stats[3,3]

## [1] 45.985

stats[3, "Birth.rate"]

## [1] 45.985

stats[5, "Country.Name"]

## [1] United Arab Emirates
## 195 Levels: Afghanistan Albania Algeria Angola Antigua and Barbuda ... Zimbabwe

#--------------------------
head(stats[, "Internet.users"])

## [1] 78.9  5.9 19.1 57.2 88.0 59.9

head(stats$Internet.users)

## [1] 78.9  5.9 19.1 57.2 88.0 59.9

stats$Internet.users[2]

## [1] 5.9

#--------------------------
levels(stats$Income.Group)

## [1] "High income"         "Low income"          "Lower middle income"
## [4] "Upper middle income"

Basic operations with a Data Frame

stats[1:10,]

stats[3:9,]

#how [] works
stats[1,]

is.data.frame(stats[1,])

## [1] TRUE

is.data.frame(stats[,1])

## [1] FALSE

is.data.frame(stats[,1,drop = F])

## [1] TRUE

# multiply columns
stats$Birth.rate * stats$Internet.users

##   [1]  808.2516  207.9927  878.3135  736.5644  971.8720 1061.1884  557.6052
##   [8] 1042.7398 1095.6000  757.8167 1074.2100   57.3963  920.3062  178.5560
##  [15]  369.0141  133.5415  488.1658 1353.6006 1104.4080  523.6930  677.1250
##  [22]  775.8912  991.1200  895.2778  762.0782  889.7240 1058.1225  542.2066
##  [29]  379.0050  119.2660  935.2200  880.6680  890.1025  554.1800  313.4880
##  [36]  238.3104  244.2726  831.1292  223.1190  810.9375  690.4111  290.4720
##  [43]  926.2500  748.5411  755.9261  715.4450  242.1170  946.2970  972.9882
##  [50]  408.1770  850.2521  824.1408   31.3200  651.8785  817.8200   62.5575
##  [57]  979.2041  759.1773 1007.6135  653.6058  281.1060 1096.0980  577.2756
##  [64]  407.5113   59.7392  595.3500  116.2593  579.9368  508.8636  676.6900
##  [71]  954.1000  541.0605 1137.2406  660.9750  586.1800  384.3554  627.4274
##  [78]  268.6570  668.3239  303.2372  306.3941 1173.7155  536.1050  286.0556
##  [85] 1293.7271 1508.0400  496.9040  502.3340 1108.8860  735.6220 1227.4200
##  [92] 1372.5660  625.6000  166.3416  334.0060  729.0220 1552.5895  338.1375
##  [99]  946.5330  113.6672  353.5125  712.8660  862.9600  391.1997  143.6900
## [106]  691.3743 1059.6744  767.3909  740.6448 1177.2880  546.3450  104.0580
## [113]  945.8127  830.2598  732.1233  154.4830  654.6811   28.9904  700.5610
## [120]  485.5000  214.4070  209.5662  425.1000  199.2680 1125.4308  416.1243
## [127] 1122.0000   84.4237 1521.7100  322.2140  958.3553 1102.6194  278.2759
## [134] 1086.0736 1356.8426  322.4438  866.5104  791.7616  880.2300  187.8435
## [141]  603.3523  798.1200  490.5552  796.5972  931.1224 1018.4820  437.9276
## [148]  897.2040  294.2010 1244.8480  759.9279  504.7823  753.3000  244.6240
## [155]   62.4393  403.8581   65.8365  473.8000  523.4766  794.3510  690.2170
## [162]  786.6143  741.2911 1118.4465  743.2971  937.4400  629.9266  105.2135
## [169]  162.3600  319.5265  492.6720  204.6912   39.3305  889.3150  930.8420
## [176]  867.2400  778.6650  173.8792  704.2788  455.1000  829.2361 1052.5000
## [183]  859.5000  847.9120 1089.3258  484.7100  682.0743  302.1507 1416.3604
## [190]  400.4316  658.9400  969.5250   93.2668  623.2534  660.7275

stats$Birth.rate + stats$Internet.users

##   [1]  89.14400  41.15300  65.08500  70.07700  99.04400  77.61600  55.20800
##   [8]  79.84700  96.20000  90.01880  77.00000  45.45100  93.37020  41.34000
##  [15]  49.65100  26.77200  62.26150 105.04004  87.33900  66.85200  66.67000
##  [22]  56.69200 105.70000  61.17600  65.97100  85.18800  80.90500  48.03400
##  [29]  40.26700  37.57600  96.70000  96.54000  79.88500  57.90000  45.72000
##  [36]  43.63600  43.61100  67.77600  40.82600  59.12500  60.98200  38.33000
##  [43]  86.60000  76.89080  84.31040  92.67000  34.98600 104.62970  67.09800
##  [50]  41.23800  61.42368  57.43200  35.70000  80.73500  89.70000  34.82500
##  [57] 102.21440  57.56300  94.21980  51.31100  39.75500 102.04410  56.63200
##  [64]  45.43100  38.93700  56.52500  40.60300  51.76200  68.36630  54.33400
##  [71]  80.30000  47.16500  82.78900  53.88500  82.10000  39.39300  76.14760
##  [78]  35.94500  81.84390  35.23700  35.39100  93.24770  47.85000  40.29300
##  [85] 109.94680  92.10000  66.95930  50.64000  68.04600  97.91000  76.73000
##  [92]  74.19400  50.20000  31.26200  40.54400  93.37000  96.03500  39.55100
##  [99]  83.92600  38.72100  37.92500  61.63000 103.00000  39.76300  33.73800
## [106]  78.55290 105.07650  85.43440  77.05600  77.02300  57.14100  37.68600
## [113]  65.54700  62.56400  76.46200  47.63800  78.41380  19.71900  71.92600
## [120]  44.27500  45.10500  40.00100  49.90000  44.50900  83.77500  43.83700
## [127]  83.00000  51.36100  78.04500  36.28800 104.15640 106.65340  34.22300
## [134]  95.90000  86.86900  40.48200  63.71000  59.39800  60.79000  35.39900
## [141]  72.44920  84.70000  69.99560  58.48800  73.19300  97.24000  58.56450
## [148]  81.17000  41.68900  81.07600  56.17700  51.63300  90.30000  38.57800
## [155]  38.42900  40.58530  45.39100  60.70000  51.22600  57.53700  55.85500
## [162]  87.98260  82.87560 106.58360  54.79300  69.00000  50.24300  48.04500
## [169]  40.58000  39.98100  46.79200  30.92200  36.85500  60.40900  78.39000
## [176]  63.60000  63.08600  43.91800  59.67400  52.10000  72.06400  96.70000
## [183]  60.70000  68.30600  74.74200  56.00000  59.43700  38.03900  76.99400
## [190]  41.47200  52.94700  67.35000  44.59400  55.87100  54.21500

head(stats)

stats$mycal = stats$Birth.rate * stats$Internet.users
head(stats)

#add column
stats$xyz = 1:5 # works because 5 divides the 195 rows, so the vector is recycled; 1:4 would fail with "replacement has 4 rows, data has 195"
head(stats, n = 12)

#remove col
stats$mycal = NULL
stats$xyz = NULL
head(stats)

Filtering a Data Frame

head(stats)

filter = stats$Internet.users < 2 # compare each number with 2
stats[filter, ] #get country that number of internet users is low

stats[stats$Birth.rate > 40,]

stats[stats$Birth.rate > 40 & stats$Internet.users < 2,] # birth > 40, internet < 2

stats[stats$Income.Group == "High income",]

levels(stats$Income.Group)

## [1] "High income"         "Low income"          "Lower middle income"
## [4] "Upper middle income"

#
stats[stats$Country.Name == "Malta",]

n = "United States"
stats[stats$Country.Name == n,]

Introduction to qplot

library(ggplot2)
# Only x is supplied, so qplot draws a histogram of Internet.users
qplot(data = stats, x = Internet.users)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

qplot(data = stats, x = Income.Group, y = Birth.rate, size = I(2)) # I() sets the point size to the constant 2 instead of mapping size to a variable

#---------------------------
qplot(data = stats, 
      x = Income.Group, 
      y = Birth.rate, 
      size = I(2),
      color = I("blue"))

#boxplot
qplot(data = stats, 
      x = Income.Group, 
      y = Birth.rate,
      geom = "boxplot")

Visualizing With Qplot: Part I

# visual
options(warn=-1)
qplot(data = stats, 
      x = Internet.users, 
      y = Birth.rate
      )

# visual2
options(warn=-1)
qplot(data = stats, 
      x = Internet.users, 
      y = Birth.rate,
      size = I(.75),
      color = I("red"))

# visual3 low income more birthrate, low internet. 
# high income, low birthrate, high internet.
options(warn=-1)
qplot(data = stats, 
      x = Internet.users, 
      y = Birth.rate,
      size = I(2.0),
      color = Income.Group)

Building Dataframes

#Execute below code to generate three new vectors
# Country names for the 2012 dataset (195 entries); combined column-wise with
# Codes_2012_Dataset and Regions_2012_Dataset further down
Countries_2012_Dataset <- c("Aruba","Afghanistan","Angola","Albania","United Arab Emirates","Argentina","Armenia","Antigua and Barbuda","Australia","Austria","Azerbaijan","Burundi","Belgium","Benin","Burkina Faso","Bangladesh","Bulgaria","Bahrain","Bahamas, The","Bosnia and Herzegovina","Belarus","Belize","Bermuda","Bolivia","Brazil","Barbados","Brunei Darussalam","Bhutan","Botswana","Central African Republic","Canada","Switzerland","Chile","China","Cote d'Ivoire","Cameroon","Congo, Rep.","Colombia","Comoros","Cabo Verde","Costa Rica","Cuba","Cayman Islands","Cyprus","Czech Republic","Germany","Djibouti","Denmark","Dominican Republic","Algeria","Ecuador","Egypt, Arab Rep.","Eritrea","Spain","Estonia","Ethiopia","Finland","Fiji","France","Micronesia, Fed. Sts.","Gabon","United Kingdom","Georgia","Ghana","Guinea","Gambia, The","Guinea-Bissau","Equatorial Guinea","Greece","Grenada","Greenland","Guatemala","Guam","Guyana","Hong Kong SAR, China","Honduras","Croatia","Haiti","Hungary","Indonesia","India","Ireland","Iran, Islamic Rep.","Iraq","Iceland","Israel","Italy","Jamaica","Jordan","Japan","Kazakhstan","Kenya","Kyrgyz Republic","Cambodia","Kiribati","Korea, Rep.","Kuwait","Lao PDR","Lebanon","Liberia","Libya","St. Lucia","Liechtenstein","Sri Lanka","Lesotho","Lithuania","Luxembourg","Latvia","Macao SAR, China","Morocco","Moldova","Madagascar","Maldives","Mexico","Macedonia, FYR","Mali","Malta","Myanmar","Montenegro","Mongolia","Mozambique","Mauritania","Mauritius","Malawi","Malaysia","Namibia","New Caledonia","Niger","Nigeria","Nicaragua","Netherlands","Norway","Nepal","New Zealand","Oman","Pakistan","Panama","Peru","Philippines","Papua New Guinea","Poland","Puerto Rico","Portugal","Paraguay","French Polynesia","Qatar","Romania","Russian Federation","Rwanda","Saudi Arabia","Sudan","Senegal","Singapore","Solomon Islands","Sierra Leone","El Salvador","Somalia","Serbia","South Sudan","Sao Tome and Principe","Suriname","Slovak Republic","Slovenia","Sweden","Swaziland","Seychelles","Syrian Arab Republic","Chad","Togo","Thailand","Tajikistan","Turkmenistan","Timor-Leste","Tonga","Trinidad and Tobago","Tunisia","Turkey","Tanzania","Uganda","Ukraine","Uruguay","United States","Uzbekistan","St. Vincent and the Grenadines","Venezuela, RB","Virgin Islands (U.S.)","Vietnam","Vanuatu","West Bank and Gaza","Samoa","Yemen, Rep.","South Africa","Congo, Dem. Rep.","Zambia","Zimbabwe")
Codes_2012_Dataset <- c("ABW","AFG","AGO","ALB","ARE","ARG","ARM","ATG","AUS","AUT","AZE","BDI","BEL","BEN","BFA","BGD","BGR","BHR","BHS","BIH","BLR","BLZ","BMU","BOL","BRA","BRB","BRN","BTN","BWA","CAF","CAN","CHE","CHL","CHN","CIV","CMR","COG","COL","COM","CPV","CRI","CUB","CYM","CYP","CZE","DEU","DJI","DNK","DOM","DZA","ECU","EGY","ERI","ESP","EST","ETH","FIN","FJI","FRA","FSM","GAB","GBR","GEO","GHA","GIN","GMB","GNB","GNQ","GRC","GRD","GRL","GTM","GUM","GUY","HKG","HND","HRV","HTI","HUN","IDN","IND","IRL","IRN","IRQ","ISL","ISR","ITA","JAM","JOR","JPN","KAZ","KEN","KGZ","KHM","KIR","KOR","KWT","LAO","LBN","LBR","LBY","LCA","LIE","LKA","LSO","LTU","LUX","LVA","MAC","MAR","MDA","MDG","MDV","MEX","MKD","MLI","MLT","MMR","MNE","MNG","MOZ","MRT","MUS","MWI","MYS","NAM","NCL","NER","NGA","NIC","NLD","NOR","NPL","NZL","OMN","PAK","PAN","PER","PHL","PNG","POL","PRI","PRT","PRY","PYF","QAT","ROU","RUS","RWA","SAU","SDN","SEN","SGP","SLB","SLE","SLV","SOM","SRB","SSD","STP","SUR","SVK","SVN","SWE","SWZ","SYC","SYR","TCD","TGO","THA","TJK","TKM","TLS","TON","TTO","TUN","TUR","TZA","UGA","UKR","URY","USA","UZB","VCT","VEN","VIR","VNM","VUT","PSE","WSM","YEM","ZAF","COD","ZMB","ZWE")
# Region of each country in the 2012 dataset (195 entries, six distinct regions)
Regions_2012_Dataset <- c("The Americas","Asia","Africa","Europe","Middle East","The Americas","Asia","The Americas","Oceania","Europe","Asia","Africa","Europe","Africa","Africa","Asia","Europe","Middle East","The Americas","Europe","Europe","The Americas","The Americas","The Americas","The Americas","The Americas","Asia","Asia","Africa","Africa","The Americas","Europe","The Americas","Asia","Africa","Africa","Africa","The Americas","Africa","Africa","The Americas","The Americas","The Americas","Europe","Europe","Europe","Africa","Europe","The Americas","Africa","The Americas","Africa","Africa","Europe","Europe","Africa","Europe","Oceania","Europe","Oceania","Africa","Europe","Asia","Africa","Africa","Africa","Africa","Africa","Europe","The Americas","The Americas","The Americas","Oceania","The Americas","Asia","The Americas","Europe","The Americas","Europe","Asia","Asia","Europe","Middle East","Middle East","Europe","Middle East","Europe","The Americas","Middle East","Asia","Asia","Africa","Asia","Asia","Oceania","Asia","Middle East","Asia","Middle East","Africa","Africa","The Americas","Europe","Asia","Africa","Europe","Europe","Europe","Asia","Africa","Europe","Africa","Asia","The Americas","Europe","Africa","Europe","Asia","Europe","Asia","Africa","Africa","Africa","Africa","Asia","Africa","Oceania","Africa","Africa","The Americas","Europe","Europe","Asia","Oceania","Middle East","Asia","The Americas","The Americas","Asia","Oceania","Europe","The Americas","Europe","The Americas","Oceania","Middle East","Europe","Europe","Africa","Middle East","Africa","Africa","Asia","Oceania","Africa","The Americas","Africa","Europe","Africa","Africa","The Americas","Europe","Europe","Europe","Africa","Africa","Middle East","Africa","Africa","Asia","Asia","Asia","Asia","Oceania","The Americas","Africa","Europe","Africa","Africa","Europe","The Americas","The Americas","Asia","The Americas","The Americas","The Americas","Asia","Oceania","Middle East","Oceania","Middle East","Africa","Africa","Africa","Africa")

#References:
#(c) Kirill Eremenko, www.superdatascience.com

continue

# create a data frame
df = data.frame(Countries_2012_Dataset, 
                Codes_2012_Dataset, 
                Regions_2012_Dataset)


colnames(df) = c("Country","Code","Region")
head(df)

rm(df)
#new way to renames
df = data.frame(Country = Countries_2012_Dataset, 
                Code = Codes_2012_Dataset, 
                Region = Regions_2012_Dataset, stringsAsFactors = T)
head(df)

summary(df)

##                 Country         Code              Region  
##  Afghanistan        :  1   ABW    :  1   Africa      :54  
##  Albania            :  1   AFG    :  1   Asia        :33  
##  Algeria            :  1   AGO    :  1   Europe      :42  
##  Angola             :  1   ALB    :  1   Middle East :14  
##  Antigua and Barbuda:  1   ARE    :  1   Oceania     :13  
##  Argentina          :  1   ARG    :  1   The Americas:39  
##  (Other)            :189   (Other):189

Merging Data Frames

# merge data.frame
head(stats)

head(df)

merged = merge(stats, df, by.x = 'Country.Code', by.y = 'Code') # inner join by default (all = FALSE); pass all.x = TRUE for a left join
head(merged)

# drop the country-name column that came from df; stats already supplies Country.Name
merged$Country = NULL
str(merged)

## 'data.frame':    195 obs. of  6 variables:
##  $ Country.Code  : Factor w/ 195 levels "ABW","AFG","AGO",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ Country.Name  : Factor w/ 195 levels "Afghanistan",..: 8 1 4 2 183 6 7 5 9 10 ...
##  $ Birth.rate    : num  10.2 35.3 46 12.9 11 ...
##  $ Internet.users: num  78.9 5.9 19.1 57.2 88 ...
##  $ Income.Group  : Factor w/ 4 levels "High income",..: 1 2 4 4 1 1 3 1 1 1 ...
##  $ Region        : Factor w/ 6 levels "Africa","Asia",..: 6 2 1 3 4 6 2 6 5 3 ...

tail(merged)

Visualizing With Qplot: Part II

# visual with new split
qplot(data = merged, x = Internet.users, y = Birth.rate)

qplot(data = merged, x = Internet.users, y = Birth.rate, color = Region)

# make it nicer,
# change shapes
qplot(data = merged, 
      x = Internet.users, 
      y = Birth.rate, 
      color = Region,
      size = I(2),
      shape = I(25))

Point shapes (pch): 0 = square; 1 = circle; 2 = triangle point up; 3 = plus; 4 = cross; 5 = diamond; 6 = triangle point down; 7 = square cross; 8 = star; 9 = diamond plus; 10 = circle plus; 11 = triangles up and down; 12 = square plus; 13 = circle cross; 14 = square and triangle down; 15 = filled square; 16 = filled circle; 17 = filled triangle point-up; 18 = filled diamond; 19 = solid circle; 20 = bullet (smaller circle); 21 = filled circle blue; 22 = filled square blue; 23 = filled diamond blue; 24 = filled triangle point-up blue; 25 = filled triangle point down blue.

# transparency
qplot(data = merged, 
      x = Internet.users, 
      y = Birth.rate, 
      color = Region,
      size = I(2),
      shape = I(19),
      alpha = I(0.51)) # use alpha to change transparency 0~1.

# add title
qplot(data = merged, 
      x = Internet.users, 
      y = Birth.rate, 
      color = Region,
      size = I(2),
      shape = I(19),
      alpha = I(0.51),
      main = "Birth rate vs Internet Users")

HOMEWORK: World Trends

#Execute below code to generate three new vectors
Country_Code <- c("ABW","AFG","AGO","ALB","ARE","ARG","ARM","ATG","AUS","AUT","AZE","BDI","BEL","BEN","BFA","BGD","BGR","BHR","BHS","BIH","BLR","BLZ","BOL","BRA","BRB","BRN","BTN","BWA","CAF","CAN","CHE","CHL","CHN","CIV","CMR","COG","COL","COM","CPV","CRI","CUB","CYP","CZE","DEU","DJI","DNK","DOM","DZA","ECU","EGY","ERI","ESP","EST","ETH","FIN","FJI","FRA","FSM","GAB","GBR","GEO","GHA","GIN","GMB","GNB","GNQ","GRC","GRD","GTM","GUM","GUY","HKG","HND","HRV","HTI","HUN","IDN","IND","IRL","IRN","IRQ","ISL","ITA","JAM","JOR","JPN","KAZ","KEN","KGZ","KHM","KIR","KOR","KWT","LAO","LBN","LBR","LBY","LCA","LKA","LSO","LTU","LUX","LVA","MAC","MAR","MDA","MDG","MDV","MEX","MKD","MLI","MLT","MMR","MNE","MNG","MOZ","MRT","MUS","MWI","MYS","NAM","NCL","NER","NGA","NIC","NLD","NOR","NPL","NZL","OMN","PAK","PAN","PER","PHL","PNG","POL","PRI","PRT","PRY","PYF","QAT","ROU","RUS","RWA","SAU","SDN","SEN","SGP","SLB","SLE","SLV","SOM","SSD","STP","SUR","SVK","SVN","SWE","SWZ","SYR","TCD","TGO","THA","TJK","TKM","TLS","TON","TTO","TUN","TUR","TZA","UGA","UKR","URY","USA","UZB","VCT","VEN","VIR","VNM","VUT","WSM","YEM","ZAF","COD","ZMB","ZWE")
# Life expectancy at birth in 1960 (187 values; presumably in the same order as
# Country_Code, which also has 187 entries - confirm before joining)
Life_Expectancy_At_Birth_1960 <- c(65.5693658536586,32.328512195122,32.9848292682927,62.2543658536585,52.2432195121951,65.2155365853659,65.8634634146342,61.7827317073171,70.8170731707317,68.5856097560976,60.836243902439,41.2360487804878,69.7019512195122,37.2782682926829,34.4779024390244,45.8293170731707,69.2475609756098,52.0893658536585,62.7290487804878,60.2762195121951,67.7080975609756,59.9613658536585,42.1183170731707,54.2054634146342,60.7380487804878,62.5003658536585,32.3593658536585,50.5477317073171,36.4826341463415,71.1331707317073,71.3134146341463,57.4582926829268,43.4658048780488,36.8724146341463,41.523756097561,48.5816341463415,56.716756097561,41.4424390243903,48.8564146341463,60.5761951219512,63.9046585365854,69.5939268292683,70.3487804878049,69.3129512195122,44.0212682926829,72.1765853658537,51.8452682926829,46.1351219512195,53.215,48.0137073170732,37.3629024390244,69.1092682926829,67.9059756097561,38.4057073170732,68.819756097561,55.9584878048781,69.8682926829268,57.5865853658537,39.5701219512195,71.1268292682927,63.4318536585366,45.8314634146342,34.8863902439024,32.0422195121951,37.8404390243902,36.7330487804878,68.1639024390244,59.8159268292683,45.5316341463415,61.2263414634146,60.2787317073171,66.9997073170732,46.2883170731707,64.6086585365854,42.1000975609756,68.0031707317073,48.6403170731707,41.1719512195122,69.691756097561,44.945512195122,48.0306829268293,73.4286585365854,69.1239024390244,64.1918292682927,52.6852682926829,67.6660975609756,58.3675853658537,46.3624146341463,56.1280731707317,41.2320243902439,49.2159756097561,53.0013170731707,60.3479512195122,43.2044634146342,63.2801219512195,34.7831707317073,42.6411951219512,57.303756097561,59.7471463414634,46.5107073170732,69.8473170731707,68.4463902439024,69.7868292682927,64.6609268292683,48.4466341463415,61.8127804878049,39.9746829268293,37.2686341463415,57.0656341463415,60.6228048780488,28.2116097560976,67.6017804878049,42.7363902439024,63.7056097560976,48.3688048780488,35.0037073170732,43.4830975609756,58.7452195121951,37.7736341463415,59.4753414634146,46.8803902439024,58.6390243902439,35.5150487804878,37.1829512195122,46.9988292682927,73.3926829268293,73.549756097561,35.1708292682927,71.2365853658537,42.6670731707317,45.2904634146342,60.8817073170732,47.6915853658537,57.8119268292683,38.462243902439,67.6804878048781,68.7196097560976,62.8089268292683,63.7937073170732,56.3570487804878,61.2060731707317,65.6424390243903,66.0552926829268,42.2492926829268,45.6662682926829,48.1876341463415,38.206,65.6598292682927,49.3817073170732,30.3315365853659,49.9479268292683,36.9658780487805,31.6767073170732,50.4513658536585,59.6801219512195,69.9759268292683,68.9780487804878,73.0056097560976,44.2337804878049,52.768243902439,38.0161219512195,40.2728292682927,54.6993170731707,56.1535365853659,54.4586829268293,33.7271219512195,61.3645365853659,62.6575853658537,42.009756097561,45.3844146341463,43.6538780487805,43.9835609756098,68.2995365853659,67.8963902439025,69.7707317073171,58.8855365853659,57.7238780487805,59.2851219512195,63.7302195121951,59.0670243902439,46.4874878048781,49.969512195122,34.3638048780488,49.0362926829268,41.0180487804878,45.1098048780488,51.5424634146342)
Life_Expectancy_At_Birth_2013 <- c(75.3286585365854,60.0282682926829,51.8661707317073,77.537243902439,77.1956341463415,75.9860975609756,74.5613658536585,75.7786585365854,82.1975609756098,80.890243902439,70.6931463414634,56.2516097560976,80.3853658536585,59.3120243902439,58.2406341463415,71.245243902439,74.4658536585366,76.5459512195122,75.0735365853659,76.2769268292683,72.4707317073171,69.9820487804878,67.9134390243903,74.1224390243903,75.3339512195122,78.5466585365854,69.1029268292683,64.3608048780488,49.8798780487805,81.4011219512195,82.7487804878049,81.1979268292683,75.3530243902439,51.2084634146342,55.0418048780488,61.6663902439024,73.8097317073171,62.9321707317073,72.9723658536585,79.2252195121951,79.2563902439025,79.9497804878049,78.2780487804878,81.0439024390244,61.6864634146342,80.3024390243903,73.3199024390244,74.5689512195122,75.648512195122,70.9257804878049,63.1778780487805,82.4268292682927,76.4243902439025,63.4421951219512,80.8317073170732,69.9179268292683,81.9682926829268,68.9733902439024,63.8435853658537,80.9560975609756,74.079512195122,61.1420731707317,58.216487804878,59.9992682926829,54.8384146341464,57.2908292682927,80.6341463414634,73.1935609756098,71.4863902439024,78.872512195122,66.3100243902439,83.8317073170732,72.9428536585366,77.1268292682927,62.4011463414634,75.2682926829268,68.7046097560976,67.6604146341463,81.0439024390244,75.1259756097561,69.4716829268293,83.1170731707317,82.290243902439,73.4689268292683,73.9014146341463,83.3319512195122,70.45,60.9537804878049,70.2024390243902,67.7720487804878,65.7665853658537,81.459756097561,74.462756097561,65.687243902439,80.1288780487805,60.5203902439024,71.6576829268293,74.9127073170732,74.2402926829268,49.3314634146342,74.1634146341464,81.7975609756098,73.9804878048781,80.3391463414634,73.7090487804878,68.811512195122,64.6739024390244,76.6026097560976,76.5326585365854,75.1870487804878,57.5351951219512,80.7463414634146,65.6540975609756,74.7583658536585,69.0618048780488,54.641512195122,62.8027073170732,
74.46,61.466,74.567512195122,64.3438780487805,77.1219512195122,60.8281463414634,52.4421463414634,74.514756097561,81.1048780487805,81.4512195121951,69.222,81.4073170731707,76.8410487804878,65.9636829268293,77.4192195121951,74.2838536585366,68.1315609756097,62.4491707317073,76.8487804878049,78.7111951219512,80.3731707317073,72.7991707317073,76.3340731707317,78.4184878048781,74.4634146341463,71.0731707317073,63.3948292682927,74.1776341463415,63.1670487804878,65.878756097561,82.3463414634146,67.7189268292683,50.3631219512195,72.4981463414634,55.0230243902439,55.2209024390244,66.259512195122,70.99,76.2609756097561,80.2780487804878,81.7048780487805,48.9379268292683,74.7157804878049,51.1914878048781,59.1323658536585,74.2469268292683,69.4001707317073,65.4565609756098,67.5223658536585,72.6403414634147,70.3052926829268,73.6463414634147,75.1759512195122,64.2918292682927,57.7676829268293,71.159512195122,76.8361951219512,78.8414634146341,68.2275853658537,72.8108780487805,74.0744146341464,79.6243902439024,75.756487804878,71.669243902439,73.2503902439024,63.583512195122,56.7365853658537,58.2719268292683,59.2373658536585,55.633)

#References: 
#(c) Kirill Eremenko, www.superdatascience.com

continue

# Set the working directory so that the relative CSV file name passed to
# read.csv() below resolves. First show the directory currently in use.
getwd()

## [1] "/home/jupyter-yangbdm/new folder/R-program"

# NOTE(review): this absolute path is specific to one machine/user account;
# setwd() fails on any system where the directory does not exist.
setwd("/home/jupyter-yangbdm/new folder/R-program")
# Confirm the directory that is now active.
getwd()

## [1] "/home/jupyter-yangbdm/new folder/R-program"

# Import the CSV data set. Text columns are read as factors.
# TRUE is spelled out because the shorthand T is an ordinary variable that
# can be reassigned.
data <- read.csv("P2-Section5-Homework-Data.csv", stringsAsFactors = TRUE)

# Explore the data
head(data, n = 6)  # check top 6 rows

tail(data, n = 7)  # check bottom 7 rows

str(data)          # check the structure of the data frame

## 'data.frame':    374 obs. of  5 variables:
##  $ Country.Name  : Factor w/ 187 levels "Afghanistan",..: 8 1 4 2 176 6 7 5 9 10 ...
##  $ Country.Code  : Factor w/ 187 levels "ABW","AFG","AGO",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ Region        : Factor w/ 6 levels "Africa","Asia",..: 6 2 1 3 4 6 2 6 5 3 ...
##  $ Year          : int  1960 1960 1960 1960 1960 1960 1960 1960 1960 1960 ...
##  $ Fertility.Rate: num  4.82 7.45 7.38 6.19 6.93 ...

summary(data)      # check the summary of the data

##               Country.Name  Country.Code          Region         Year     
##  Afghanistan        :  2   ABW    :  2   Africa      :106   Min.   :1960  
##  Albania            :  2   AFG    :  2   Asia        : 66   1st Qu.:1960  
##  Algeria            :  2   AGO    :  2   Europe      : 80   Median :1986  
##  Angola             :  2   ALB    :  2   Middle East : 24   Mean   :1986  
##  Antigua and Barbuda:  2   ARE    :  2   Oceania     : 26   3rd Qu.:2013  
##  Argentina          :  2   ARG    :  2   The Americas: 72   Max.   :2013  
##  (Other)            :362   (Other):362                                    
##  Fertility.Rate 
##  Min.   :1.124  
##  1st Qu.:2.243  
##  Median :3.994  
##  Mean   :4.191  
##  3rd Qu.:6.252  
##  Max.   :8.187  
##

# The Year column holds more than one value.
# From the challenge we know that there are two: 1960 and 2013.

# Split the data frame into one subset per year
data1960 <- data[with(data, Year == 1960), ]
data2013 <- data[with(data, Year == 2013), ]

# Row counts of the two subsets
nrow(data1960)  # 187 rows

## [1] 187

nrow(data2013)  # 187 rows. Equal split.

## [1] 187

# Build one supplementary data frame per year: the country codes paired
# with that year's life-expectancy-at-birth vector.
add1960 <- data.frame(
  Code     = Country_Code,
  Life.Exp = Life_Expectancy_At_Birth_1960
)
add2013 <- data.frame(
  Code     = Country_Code,
  Life.Exp = Life_Expectancy_At_Birth_2013
)

# Inspect both frames
summary(add1960)

##      Code              Life.Exp    
##  Length:187         Min.   :28.21  
##  Class :character   1st Qu.:43.47  
##  Mode  :character   Median :54.70  
##                     Mean   :53.73  
##                     3rd Qu.:64.05  
##                     Max.   :73.55

summary(add2013)

##      Code              Life.Exp    
##  Length:187         Min.   :48.94  
##  Class :character   1st Qu.:64.52  
##  Mode  :character   Median :73.25  
##                     Mean   :70.76  
##                     3rd Qu.:76.84  
##                     Max.   :83.83

# Join each year's fertility subset with its life-expectancy frame,
# matching Country.Code (left) against Code (right).
merged1960 <- merge(
  x = data1960, y = add1960,
  by.x = "Country.Code", by.y = "Code"
)
merged2013 <- merge(
  x = data2013, y = add2013,
  by.x = "Country.Code", by.y = "Code"
)

# Inspect the structure of the merged frames
str(merged1960)

## 'data.frame':    187 obs. of  6 variables:
##  $ Country.Code  : Factor w/ 187 levels "ABW","AFG","AGO",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ Country.Name  : Factor w/ 187 levels "Afghanistan",..: 8 1 4 2 176 6 7 5 9 10 ...
##  $ Region        : Factor w/ 6 levels "Africa","Asia",..: 6 2 1 3 4 6 2 6 5 3 ...
##  $ Year          : int  1960 1960 1960 1960 1960 1960 1960 1960 1960 1960 ...
##  $ Fertility.Rate: num  4.82 7.45 7.38 6.19 6.93 ...
##  $ Life.Exp      : num  65.6 32.3 33 62.3 52.2 ...

str(merged2013)

## 'data.frame':    187 obs. of  6 variables:
##  $ Country.Code  : Factor w/ 187 levels "ABW","AFG","AGO",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ Country.Name  : Factor w/ 187 levels "Afghanistan",..: 8 1 4 2 176 6 7 5 9 10 ...
##  $ Region        : Factor w/ 6 levels "Africa","Asia",..: 6 2 1 3 4 6 2 6 5 3 ...
##  $ Year          : int  2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
##  $ Fertility.Rate: num  1.67 5.05 6.17 1.77 1.8 ...
##  $ Life.Exp      : num  75.3 60 51.9 77.5 77.2 ...

# Each merged frame covers a single year, so its "Year" column is obsolete.
# Remove it from both frames.
merged1960[["Year"]] <- NULL
merged2013[["Year"]] <- NULL
# Confirm the column is gone
str(merged1960)

## 'data.frame':    187 obs. of  5 variables:
##  $ Country.Code  : Factor w/ 187 levels "ABW","AFG","AGO",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ Country.Name  : Factor w/ 187 levels "Afghanistan",..: 8 1 4 2 176 6 7 5 9 10 ...
##  $ Region        : Factor w/ 6 levels "Africa","Asia",..: 6 2 1 3 4 6 2 6 5 3 ...
##  $ Fertility.Rate: num  4.82 7.45 7.38 6.19 6.93 ...
##  $ Life.Exp      : num  65.6 32.3 33 62.3 52.2 ...

str(merged2013)

## 'data.frame':    187 obs. of  5 variables:
##  $ Country.Code  : Factor w/ 187 levels "ABW","AFG","AGO",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ Country.Name  : Factor w/ 187 levels "Afghanistan",..: 8 1 4 2 176 6 7 5 9 10 ...
##  $ Region        : Factor w/ 6 levels "Africa","Asia",..: 6 2 1 3 4 6 2 6 5 3 ...
##  $ Fertility.Rate: num  1.67 5.05 6.17 1.77 1.8 ...
##  $ Life.Exp      : num  75.3 60 51.9 77.5 77.2 ...

# Visualization time
library(ggplot2)
# NOTE: warn = -1 silences every warning for the rest of the session,
# not only the ones raised by the plots below.
options(warn = -1)
# Visualize the 1960 dataset.
# Written with ggplot() + geom_point() because qplot() is deprecated
# (ggplot2 3.4.0); the resulting scatter plot is the same.
ggplot(data = merged1960,
       aes(x = Fertility.Rate, y = Life.Exp,
           color = Region)) +                       # colour by region
  geom_point(size = 2,                              # fixed point size
             alpha = 0.6) +                         # transparency
  ggtitle("Life Expectancy vs Fertility (1960)")    # title

# Visualize the 2013 dataset.
# The title now names 2013; it previously repeated the 1960 title although
# the plotted data is merged2013.
# Written with ggplot() + geom_point() because qplot() is deprecated
# (ggplot2 3.4.0); the resulting scatter plot is the same.
ggplot(data = merged2013,
       aes(x = Fertility.Rate, y = Life.Exp,
           color = Region)) +                       # colour by region
  geom_point(size = 2,                              # fixed point size
             alpha = 0.6) +                         # transparency
  ggtitle("Life Expectancy vs Fertility (2013)")    # title

6 Advanced Visualization with GGPlot2

Project Brief: Movie Ratings

# Movie Ratings introduction, no code

Grammar Of Graphics - GGPlot2

# Grammar introduction, no code
# Data: movie name, budget, genre.
# Aesthetics: x axis, y axis, color of columns.
# Geometries: statistics, histograms; group rows by genre or other variables.
# Facets: ~, separated charts.
# Coordinates: x-y chart.
# Theme: title, labels, size of pictures, subtitles.

What is a Factor?

# Read the movie ratings. Text columns are read as factors.
# TRUE is spelled out because the shorthand T is an ordinary variable that
# can be reassigned.
movies <- read.csv("P2-Movie-Ratings.csv", stringsAsFactors = TRUE)
head(movies)

# Replace the CSV headers with short names; they are assigned by position.
colnames(movies) <- c("Film", "Genre", "CriticRating", "AudienceRating",
                      "BudgetMillions", "Year")
str(movies)

## 'data.frame':    562 obs. of  6 variables:
##  $ Film          : Factor w/ 562 levels "(500) Days of Summer ",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ Genre         : Factor w/ 7 levels "Action","Adventure",..: 3 2 1 2 3 1 3 5 3 3 ...
##  $ CriticRating  : int  87 9 30 93 55 39 40 50 43 93 ...
##  $ AudienceRating: int  81 44 52 84 70 63 71 57 48 93 ...
##  $ BudgetMillions: int  8 105 20 18 20 200 30 32 28 8 ...
##  $ Year          : int  2009 2008 2009 2010 2009 2009 2008 2007 2011 2011 ...

summary(movies)

##                     Film           Genre      CriticRating  AudienceRating 
##  (500) Days of Summer :  1   Action   :154   Min.   : 0.0   Min.   : 0.00  
##  10,000 B.C.          :  1   Adventure: 29   1st Qu.:25.0   1st Qu.:47.00  
##  12 Rounds            :  1   Comedy   :172   Median :46.0   Median :58.00  
##  127 Hours            :  1   Drama    :101   Mean   :47.4   Mean   :58.83  
##  17 Again             :  1   Horror   : 49   3rd Qu.:70.0   3rd Qu.:72.00  
##  2012                 :  1   Romance  : 21   Max.   :97.0   Max.   :96.00  
##  (Other)              :556   Thriller : 36                                 
##  BudgetMillions       Year     
##  Min.   :  0.0   Min.   :2007  
##  1st Qu.: 20.0   1st Qu.:2008  
##  Median : 35.0   Median :2009  
##  Mean   : 50.1   Mean   :2009  
##  3rd Qu.: 65.0   3rd Qu.:2010  
##  Max.   :300.0   Max.   :2011  
##

# Convert Year to a factor so that it is treated as a categorical variable,
# then check the structure again.
movies[["Year"]] <- factor(movies[["Year"]])
str(movies)

## 'data.frame':    562 obs. of  6 variables:
##  $ Film          : Factor w/ 562 levels "(500) Days of Summer ",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ Genre         : Factor w/ 7 levels "Action","Adventure",..: 3 2 1 2 3 1 3 5 3 3 ...
##  $ CriticRating  : int  87 9 30 93 55 39 40 50 43 93 ...
##  $ AudienceRating: int  81 44 52 84 70 63 71 57 48 93 ...
##  $ BudgetMillions: int  8 105 20 18 20 200 30 32 28 8 ...
##  $ Year          : Factor w/ 5 levels "2007","2008",..: 3 2 3 4 3 3 2 1 5 5 ...

Aesthetics

# Aesthetics: map data columns onto visual properties of the points.
library(ggplot2)
ggplot(
  data = movies,
  mapping = aes(
    x = CriticRating,
    y = AudienceRating,
    color = Genre,          # colour follows the genre
    size = BudgetMillions   # size follows the budget
  )
) +
  geom_point()

Plotting With Layers

# Geometries: keep data + aesthetics in an object, then add layers to it.
p <- ggplot(
  data = movies,
  mapping = aes(x = CriticRating, y = AudienceRating,
                color = Genre, size = BudgetMillions)
)
# point layer
p + geom_point()

# line layer (not good)
p + geom_line()

# multiple layers: lines with points added on top (still not good)
p + geom_line() + geom_point()

Overriding Aesthetics

# Base plot for the examples below: colour and size are mapped at plot
# level, so each layer inherits them unless it overrides them.
q <- ggplot(data = movies, aes(x = CriticRating, y = AudienceRating,
                               color = Genre,
                               size = BudgetMillions))
# add geom layers
q + geom_point()

# overriding aesthetics
# ex1: point size follows CriticRating instead of BudgetMillions
q + geom_point(aes(size = CriticRating))

# ex2: point colour follows BudgetMillions instead of Genre
q + geom_point(aes(color = BudgetMillions))

# ex3 (inappropriate): x is remapped to BudgetMillions, so the axis label
# has to be corrected by hand
q + geom_point(aes(x = BudgetMillions)) + xlab("Budget Millions $")

# ex4 reduce line size.
# Line width is set with `linewidth`; using `size` for lines is deprecated
# since ggplot2 3.4.0.
q + geom_line(linewidth = 1) + geom_point()

Mapping vs Setting

r <- ggplot(
  data = movies,
  mapping = aes(x = CriticRating, y = AudienceRating,
                color = Genre, size = BudgetMillions)
)

# Colour
# 1. Mapping: the colour is driven by a data column, so it goes inside aes()
r + geom_point(aes(color = Genre))

# 2. Setting: one fixed colour for every point, so it goes outside aes()
r + geom_point(color = 'Darkgreen')

# Mistake (the call runs, but does not do what was intended): inside aes()
# the string is treated as a data value and mapped through the colour scale
# instead of being used as the colour itself.
# r + geom_point(aes(color = 'Darkgreen'))

# Size
# Mapping
r + geom_point(aes(size = BudgetMillions))

# Setting
r + geom_point(size = 3)

# Same mistake for size: inside aes() the constant 3 is treated as data and
# mapped through the size scale.
# r + geom_point(aes(size = 3))

Histograms and Density Charts

# Budget distribution: only x is mapped at plot level.
s <- ggplot(movies, aes(x = BudgetMillions))
s +
  geom_histogram(binwidth = 5)

# fill the bars by genre
s +
  geom_histogram(aes(fill = Genre), binwidth = 5)

# add a black border around every bar segment
s +
  geom_histogram(aes(fill = Genre), binwidth = 5, color = "Black")

# density chart, one filled curve per genre
s +
  geom_density(aes(fill = Genre))

# the same curves stacked on top of each other
s +
  geom_density(aes(fill = Genre), position = "stack")

Starting Layer Tips

# Variant 1: data and the x aesthetic both live in the starting layer.
t <- ggplot(movies, aes(x = AudienceRating))
t +
  geom_histogram(binwidth = 5, fill = 'White', color = 'Blue')

# Variant 2: only the data lives in the starting layer; each geom supplies
# its own x aesthetic.
t <- ggplot(movies)
t +
  geom_histogram(aes(x = AudienceRating),
                 binwidth = 5, fill = 'White', color = 'Blue')

# Variant 4: the same starting layer reused for a different column
t +
  geom_histogram(aes(x = CriticRating),
                 binwidth = 5, fill = 'White', color = 'Blue')

# Variant 5: an empty starting layer, so that layers can bring their own
# (different) datasets
t <- ggplot()

Statistical Transformations

# Scatter plot with one smoothed trend line per genre; fill = NA hides the
# confidence band.
u <- ggplot(data = movies, aes(x = CriticRating, y = AudienceRating,
                               color = Genre))
u + geom_point() + geom_smooth(fill = NA)

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# boxplot of audience rating per genre.
# Box outline width is set with `linewidth`; using `size` for line widths
# is deprecated since ggplot2 3.4.0.
u <- ggplot(data = movies, aes(x = Genre, y = AudienceRating, color = Genre))
u + geom_boxplot(linewidth = 1) + geom_point()

# tip: jitter spreads the points horizontally so they overlap less
u + geom_boxplot(linewidth = 1) + geom_jitter()

# points first, semi-transparent boxes drawn on top
u + geom_jitter() + geom_boxplot(linewidth = 1, alpha = 0.5)

Using Facets

# Budget histogram, bars filled by genre
v <- ggplot(movies, aes(x = BudgetMillions))
v +
  geom_histogram(aes(fill = Genre), binwidth = 5, color = 'Black')

# facets: one row of the grid per genre, each with its own axis scales
v +
  geom_histogram(aes(fill = Genre), binwidth = 5, color = 'Black') +
  facet_grid(Genre ~ ., scales = "free")

# Scatter plot of audience rating against critic rating, coloured by genre
w <- ggplot(movies, aes(x = CriticRating, y = AudienceRating, color = Genre))
w +
  geom_point(size = 2)

# facet by genre: one row per genre
w +
  geom_point(size = 1) +
  facet_grid(Genre ~ .)

# facet by year: one column per year
w +
  geom_point(size = 1) +
  facet_grid(. ~ Year)

# facet by both: genres in rows, years in columns
w +
  geom_point(size = 1) +
  facet_grid(Genre ~ Year)

# Silence warnings. This is a global option and stays in effect for the
# rest of the session.
options(warn = -1)
# Genre-by-Year grid with a smoother added to every panel
w +
  geom_point(size = 1) +
  geom_smooth() +
  facet_grid(Genre ~ Year)

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Same grid, with the point size mapped to the budget
w +
  geom_point(aes(size = BudgetMillions)) +
  geom_smooth() +
  facet_grid(Genre ~ Year)

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

Coordinates

# How to zoom in and out
m <- ggplot(
  data = movies,
  mapping = aes(x = CriticRating, y = AudienceRating,
                color = Genre, size = BudgetMillions)
)
m +
  geom_point() +
  xlim(50, 100) +  # zoom in on the x axis
  ylim(50, 100)    # zoom in on the y axis

# Zoom for columns: restrict the visible y range via the coordinate system
n <- ggplot(movies, aes(x = BudgetMillions))
n +
  geom_histogram(aes(fill = Genre), binwidth = 15, color = 'Black') +
  coord_cartesian(ylim = c(0, 50))

# Improved Genre-by-Year grid with budget-sized points: the visible y range
# is fixed to 0-100 in every panel
w +
  geom_point(aes(size = BudgetMillions)) +
  geom_smooth() +
  facet_grid(Genre ~ Year) +
  coord_cartesian(ylim = c(0, 100))

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

Perfecting By Adding Themes

# Base histogram of budgets, bars filled by genre
o <- ggplot(movies, aes(x = BudgetMillions))
h <- o + geom_histogram(binwidth = 10, aes(fill = Genre), color = 'Black')
h

# Step 1: axis labels plus formatting of axis titles and tick labels
h_axes <- h +
  xlab('Money Axis') +
  ylab('Number of Movies') +
  theme(
    axis.title.x = element_text(color = "DarkGreen", size = 10),
    axis.title.y = element_text(color = "Red", size = 10),
    axis.text.x = element_text(size = 10),
    axis.text.y = element_text(size = 10)
  )
h_axes

# Step 2: legend format - larger legend text, legend anchored with its
# top-right corner at the top-right of the plot area
h_legend <- h_axes +
  theme(
    legend.title = element_text(size = 15),
    legend.text = element_text(size = 15),
    legend.position = c(1, 1),
    legend.justification = c(1, 1)
  )
h_legend

# Step 3: add the plot title and format it
h_legend +
  ggtitle('Movie Budget Distribution') +
  theme(
    plot.title = element_text(color = 'Darkblue',
                              size = 20,
                              family = 'Courier')
  )

chapter 6 Homework

# Read the homework data set. Text columns are read as factors.
# TRUE is spelled out because the shorthand T is an ordinary variable that
# can be reassigned.
mov <- read.csv("Section6-Homework-Data.csv", stringsAsFactors = TRUE)

# Data Exploration
nrow(mov)  # number of rows

## [1] 608

ncol(mov)  # number of columns

## [1] 18

str(mov)   # structure of the dataset

## 'data.frame':    608 obs. of  18 variables:
##  $ Day.of.Week           : Factor w/ 6 levels "Friday","Saturday",..: 1 1 1 1 1 1 4 1 1 1 ...
##  $ Director              : Factor w/ 337 levels "Aaron Blaise, Robert A. Walker",..: 31 297 233 256 287 76 276 71 108 126 ...
##  $ Genre                 : Factor w/ 15 levels "action","adventure",..: 1 1 1 5 1 1 2 1 1 10 ...
##  $ Movie.Title           : Factor w/ 608 levels "10,000 B.C.",..: 557 314 466 6 592 161 233 378 128 331 ...
##  $ Release.Date          : Factor w/ 534 levels "1/05/2009","1/05/2015",..: 273 86 121 134 384 159 347 16 28 257 ...
##  $ Studio                : Factor w/ 36 levels "Art House Studios",..: 2 2 11 25 25 25 2 31 31 20 ...
##  $ Adjusted.Gross...mill.: Factor w/ 585 levels "1,003","1,020",..: 50 51 52 53 54 55 56 57 58 59 ...
##  $ Budget...mill.        : num  170 66 100 42 150 80 50 85 70 5 ...
##  $ Gross...mill.         : Factor w/ 561 levels "1,004.60","1,017",..: 30 33 43 27 40 59 63 49 72 45 ...
##  $ IMDb.Rating           : num  6.7 6.6 6.1 7.2 8 5.8 6 6.8 6.3 5.9 ...
##  $ MovieLens.Rating      : num  3.26 2.97 2.93 3.62 3.65 2.85 3.16 3.45 2.92 2.9 ...
##  $ Overseas...mill.      : Factor w/ 551 levels "1,160.60","1,528.10",..: 32 151 172 490 82 66 528 523 150 11 ...
##  $ Overseas.             : num  55.4 78.6 80.9 31.3 64.4 59.5 39.9 39.3 73.9 49.8 ...
##  $ Profit...mill.        : Factor w/ 566 levels "1,015.40","1,025.90",..: 366 47 13 94 494 39 100 28 69 189 ...
##  $ Profit.               : num  18.9 208 106.2 380 36.9 ...
##  $ Runtime..min.         : int  130 132 126 109 131 134 125 115 92 84 ...
##  $ US...mill.            : num  90.2 43.6 39.3 138.4 73.1 ...
##  $ Gross...US            : num  44.6 21.4 19.1 68.7 35.6 40.5 60.1 60.7 26.1 50.2 ...

# Activate ggplot2
# install.packages("ggplot2")
library(ggplot2)
# {Offtopic} This is a cool insight:
# Notice? No movies are released on a Monday. Ever.
ggplot(mov, aes(x = Day.of.Week)) +
  geom_bar()

# Now we need to filter our dataset to leave only the
# Genres and Studios that we are interested in.
# Genre filter: %in% tests membership in the set of wanted genres. Unlike a
# chain of == comparisons joined with |, it yields FALSE rather than NA for
# missing values, so the row selection below cannot pick up all-NA rows.
filt <- mov$Genre %in% c("action", "adventure", "animation", "comedy", "drama")

# Studio filter, built the same way:
filt2 <- mov$Studio %in% c("Buena Vista Studios", "WB", "Fox", "Universal",
                           "Sony", "Paramount Pictures")

# Apply the row filters to the data frame: keep rows matching both filters
mov2 <- mov[filt & filt2, ]
mov2

# Prepare the plot's data and aes layers.
# Note we did not rename the columns.
# Use str() or summary() to find out the correct column names.
p <- ggplot(mov2, aes(x = Genre, y = Gross...US))
# p  # Nothing happens. We need a geom.

# Add a point geom layer
p + geom_point()

# Add a boxplot instead of the points
p + geom_boxplot()

# Notice that outliers are part of the boxplot layer.
# We will use this observation later (*)

# Add jittered points (size by budget, colour by studio) underneath
# semi-transparent boxes. (*) outlier.color = NA hides the boxplot's own
# outlier markers.
q <- p +
  geom_jitter(aes(size = Budget...mill., color = Studio)) +
  geom_boxplot(alpha = 0.5, outlier.color = NA)
q

# non-data ink: axis labels and title
q <- q +
  labs(x = 'Genre', y = 'Gross % US', title = 'Domestic Gross %')
q

# Add theme: font family for all text, then sizes/colours per element
q <- q +
  theme(
    text = element_text(family = 'Times New Roman'),  # change text style
    plot.title = element_text(size = 15),
    axis.title.x = element_text(color = 'Blue', size = 10),
    axis.title.y = element_text(color = 'Blue', size = 10),
    axis.text.x = element_text(size = 10),
    axis.text.y = element_text(size = 10),
    legend.title = element_text(size = 15),
    legend.text = element_text(size = 15)
  )
q

# Final touch: rename the size legend.
# labs() is the public way to set a legend title; it replaces writing to
# the plot object's internal `labels` list directly.
q <- q + labs(size = 'Budget $M')
q