This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.

Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Ctrl+Shift+Enter.

plot(cars)

mean(cars$speed)
## [1] 15.4
mean(cars$dist)
## [1] 42.98
max(cars$dist)
## [1] 120
max(cars$speed)
## [1] 25

First Steps with R activity

#Basic calculations

4+1
## [1] 5
5-2
## [1] 3
2^2
## [1] 4
2^5
## [1] 32
#natural log of 2.72 (approximately e, so the result is close to 1)
log(2.72)
## [1] 1.000632
log10(5)
## [1] 0.69897
#log10() above computes the log base 10; the log() calls below pass the base explicitly via the base= argument
log(10,base=5)
## [1] 1.430677
log(10,base=2)
## [1] 3.321928
log(1000,base=10)
## [1] 3

#Question_1: Compute the log base 5 of 10 and the log of 10.

log(10,base=5)
## [1] 1.430677
log(10,base=10)
## [1] 1

#Computing Offensive Metrics in Baseball

#Keep the unrounded ratio in BA, then round it to three decimal places
BA <- 29 / 112
BA
## [1] 0.2589286
Batting_Average <- round(BA, digits = 3)
Batting_Average
## [1] 0.259

#Question_2: What is the batting average of a player who gets 42 hits in 212 at bats?

#42 hits divided by 212 at bats, shown unrounded and then rounded to three decimals
BA <- 42 / 212
BA
## [1] 0.1981132
Batting_Average <- round(BA, digits = 3)
Batting_Average
## [1] 0.198

#Computing the On Base Percentage stats

#Question_3:Compute the OBP for a player with the following general stats:AB=565,H=156,BB=65,HBP=3,SF=7

#OBP = (H + BB + HBP) / (AB + BB + HBP + SF)
#Hits are already counted within at bats, so H must not be added to the denominator again
OBP <- (156 + 65 + 3) / (565 + 65 + 3 + 7)
OBP
## [1] 0.35
On_Base_Percentage <- round(OBP, digits = 3)
On_Base_Percentage
## [1] 0.35

#Testing if something is less than, greater than or equal to something

3 == 8
## [1] FALSE
3 != 8
## [1] TRUE
3<= 8
## [1] TRUE
3>4
## [1] FALSE

#Logical operators are & for logical AND, | for logical OR, and ! for NOT

#Logical Disjunction (or)
FALSE|FALSE
## [1] FALSE
#Logical Conjunction (and)
TRUE & FALSE
## [1] FALSE
#Negation
! FALSE
## [1] TRUE
#Combination of statements
2 < 3 | 1==5
## [1] TRUE

Assigning Values to Variables

#Creating a variable and assigning it a value using <-

Total_Bases <- 6 + 5
Total_Bases*3
## [1] 33

#To see variables that are currently defined, we use ls (list)

ls()
## [1] "BA"                 "Batting_Average"    "OBP"               
## [4] "On_Base_Percentage" "Total_Bases"

#To delete a variable, use rm (remove)

rm(Total_Bases)

Vectors

#The basic type of object in R is a vector, an ordered collection of values of the same type. Create one with the c() function (concatenate)

pitches_by_innings <- c(12, 15, 10, 20, 10)
pitches_by_innings
## [1] 12 15 10 20 10
strikes_by_innings <- c(9, 12, 6, 14, 9)
strikes_by_innings
## [1]  9 12  6 14  9

#Question_4: Define two vectors,runs_per_9innings and hits_per_9innings, each with five elements.

runs_per_9innings <- c(5, 7, 10, 13, 4)
runs_per_9innings
## [1]  5  7 10 13  4
hits_per_9innings <- c(8, 14, 2, 7, 20)
hits_per_9innings
## [1]  8 14  2  7 20

#Functions that will create vectors with regular patterns, like repeated elements

#replicate function
rep(2, 5)
## [1] 2 2 2 2 2
rep(1, 4)
## [1] 1 1 1 1
#consecutive numbers
1:5
## [1] 1 2 3 4 5
2:10
## [1]  2  3  4  5  6  7  8  9 10
# sequence from 1 to 10 with a step of 2
seq(1, 10, by=2)
## [1] 1 3 5 7 9
seq(2, 13, by=3)
## [1]  2  5  8 11

#Many functions and operators like + or - will work on all elements of the vector

# add vectors
pitches_by_innings+strikes_by_innings
## [1] 21 27 16 34 19
# compare vectors
pitches_by_innings == strikes_by_innings
## [1] FALSE FALSE FALSE FALSE FALSE
# find length of vector
length(pitches_by_innings)
## [1] 5
# find minimum value in vector
min(pitches_by_innings)
## [1] 10
# find average value in vector
mean(pitches_by_innings)
## [1] 13.4

#Accessing parts of a vector by using [

pitches_by_innings
## [1] 12 15 10 20 10
#getting the first element
pitches_by_innings[1]
## [1] 12

#Question_5: Get the first element of hits_per_9innings.

hits_per_9innings
## [1]  8 14  2  7 20
hits_per_9innings[1]
## [1] 8

#To get the last element of pitches_by_innings without explicitly typing the number of elements of pitches_by_innings, make use of the length function

pitches_by_innings[length(pitches_by_innings)]
## [1] 10

#Question_6: Get the last element of hits_per_9innings.

hits_per_9innings[length(hits_per_9innings)]
## [1] 20

#You can also extract multiple values from a vector. For instance to get the 2nd through 4th values use

pitches_by_innings[c(2, 3, 4)]
## [1] 15 10 20
hits_per_9innings[c(2, 3, 4)]
## [1] 14  2  7

#Vectors can also be strings or logical values

player_positions <- c("catcher", "pitcher", "infielders", "outfielders")

Data Frames

#Data is often stored as a data frame, like a spreadsheet, with rows as observations and columns as variables. To manually create a data frame, use the data.frame() function

data.frame(bonus = c(2, 3, 1),#in millions 
           active_roster = c("yes", "no", "yes"), 
           salary = c(1.5, 2.5, 1))#in millions 

How to Make a Random Sample

#To randomly select a sample use the function sample(). The following code selects 5 numbers between 1 and 10 at random (without duplication)

sample(1:10, size=5)
## [1] 2 9 1 3 4

#The first argument gives the vector of data to select elements from. The second argument (size=) gives the size of the sample to select.

#Use sample() to select a sample of size n from a vector of the row numbers of the data frame. Use the index operator [ to select those rows from the data frame.

bar <- data.frame(var1 = LETTERS[1:10], var2 = 1:10)
#Check Data Frame
bar

#Suppose you want to select a random sample of size 5. First, define a variable n with the size of the sample, i.e. 5

n <- 5

#Now, select a sample of size 5 from the vector with 1 to 10 (the number of rows in bar). Use the function nrow() to find the number of rows in bar instead of manually entering that number.

#Use seq_len() to create a vector with all the integers between 1 and the number of rows in bar. Unlike 1:nrow(bar), seq_len() returns an empty vector (rather than c(1, 0)) when the data frame has no rows.

samplerows <- sample(seq_len(nrow(bar)), size = n)
#print sample rows
samplerows
## [1] 8 4 7 2 6

#The variable samplerows contains the rows of bar which make a random sample from all the rows in bar.

#extract rows
barsample <- bar[samplerows, ]
#print sample
print(barsample)
##   var1 var2
## 8    H    8
## 4    D    4
## 7    G    7
## 2    B    2
## 6    F    6

#The code above creates a new data frame called barsample with a random sample of rows from bar. The same selection can be written in a single line (seq_len() is used so that a data frame with no rows yields an empty index vector instead of c(1, 0)):

bar[sample(seq_len(nrow(bar)), size = n), ]

Using Tables

#The table() command allows us to look at tables. Its simplest usage looks like table(x) where x is a categorical variable. We can enter the data into R with the c() command, and summarize it with the table() command as follows

x <- c("Yes","No","No","Yes","Yes") 
table(x)
## x
##  No Yes 
##   2   3

Numerical measures of center and spread

#Suppose the yearly compensations of MLB teams’ CEOs are sampled and the following values are found (in millions): 12, 0.4, 5, 2, 50, 8, 3, 1, 4, 0.25

sals <- c(12, .4, 5, 2, 50, 8, 3, 1, 4, 0.25)
# the average
mean(sals)
## [1] 8.565
# the variance
var(sals)
## [1] 225.5145
# the standard deviation
sd(sals)
## [1] 15.01714
# the median
median(sals)
## [1] 3.5
# Tukey's five number summary, useful for boxplots
# five numbers: min, lower hinge, median, upper hinge, max
fivenum(sals)
## [1]  0.25  1.00  3.50  8.00 50.00
# summary statistics
summary(sals)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.250   1.250   3.500   8.565   7.250  50.000

How about the mode?

#In R we can write our own functions, and a first example of a function is shown below in order to compute the mode of a vector of observations x

# getMode: return the most frequent value (the mode) of a vector x.
# The distinct values of x are counted in order of first appearance; when
# several values share the highest count, the one that appears first in x
# is returned, because which.max() picks the first maximum.
getMode <- function(x) {
  distinct_values <- unique(x)
  # Position of each element of x within the distinct values
  positions <- match(x, distinct_values)
  # Number of occurrences of each distinct value
  frequencies <- tabulate(positions)
  distinct_values[which.max(frequencies)]
}

#As an example, we can use the function defined above to find the most frequent value of the number of pitches_by_innings

# Most frequent value in pitches_by_innings
getMode(pitches_by_innings)
## [1] 10

#Question_7: Find the most frequent value of hits_per_9innings.

#most frequent value of hits_per_9innings (every value occurs exactly once, so getMode simply returns the first element)
getMode(hits_per_9innings)
## [1] 8

#Question_8: Summarize the following survey with the table() command: What is your favorite day of the week to watch baseball? A total of 10 fans submitted this survey: Saturday, Saturday, Sunday, Monday, Saturday, Tuesday, Sunday, Friday, Friday, Monday

#Survey answers, one entry per response, in the order they were submitted
game_day <- c(
  "Saturday", "Saturday", "Sunday", "Monday", "Saturday",
  "Tuesday", "Sunday", "Friday", "Friday", "Monday"
)

#Count how many times each day was chosen
table(game_day)
## game_day
##   Friday   Monday Saturday   Sunday  Tuesday 
##        2        2        3        2        1

#Question_9: What is the most frequent answer recorded in the survey? Use the getMode function to compute results.

getMode(game_day)
## [1] "Saturday"