# First Steps with R activity
# Basic calculations

# Addition
2 + 3
## [1] 5
# Division
2 / 3
## [1] 0.6666667
# Exponentiation
2^3
## [1] 8
# Square root
sqrt(100)
## [1] 10
# Logarithms (log() with one argument is the natural logarithm)
log(2)
## [1] 0.6931472

# Computing some offensive metrics in baseball

# Batting average = (number of hits) / (number of at bats)

# What is the batting average of a player with 29 hits in 112 at bats?
# Note: `bats` holds the number of hits, `total_bats` the number of at bats.
bats <- 29
total_bats <- 112

ans <- bats / total_bats
ans
## [1] 0.2589286
# Round the batting average to 3 decimal places
Batting_average <- round(ans, 3)
Batting_average
## [1] 0.259
# Question_2: What is the batting average of a player with 42 hits in
# 212 at bats?
bats2 <- 42
totalbats2 <- 212

ans2 <- bats2 / totalbats2
ans2
## [1] 0.1981132
# On-Base Percentage (OBP)
# OBP = (H + BB + HBP) / (AB + BB + HBP + SF)
#   H = hits, BB = walks, HBP = hit by pitch, AB = at bats,
#   SF = sacrifice flies
# Hits are NOT added to the denominator: every hit is already counted as an
# at bat, so adding H again would count those plate appearances twice and
# understate the OBP.
# Let us compute the OBP for a player with the following general stats:

AB <- 515
H <- 172
BB <- 84
HBP <- 5
SF <- 6

OBP <- (H + BB + HBP) / (AB + BB + HBP + SF)
OBP
## [1] 0.4278689
# Round the OBP to 3 decimal places
round(OBP, digits = 3)
## [1] 0.428
# Question_3: Compute the OBP for a player with the following general stats:

AB <- 565
H <- 156
BB <- 65
HBP <- 3
SF <- 7

OBP2 <- (H + BB + HBP) / (AB + BB + HBP + SF)
OBP2
## [1] 0.35

# Often you will want to test whether something is less than, greater than,
# or equal to something else.

# Equality: is 3 equal to 8? (== tests whether two values are equivalent)
3 == 8
## [1] FALSE
# Inequality: is 3 different from 8? (!= means "not equal")
3 != 8
## [1] TRUE
# Less than or equal: is 2 less than or equal to 0?
2 <= 0
## [1] FALSE
# Greater than: is 3 greater than 4?
3 > 4
## [1] FALSE

# The logical operators are & for logical AND, | for logical OR, and ! for
# NOT. These are some examples:

# Use & first (both operands are numbers here, yet the result is logical)
3 & 3
## [1] TRUE
# Now use OR
FALSE | FALSE
## [1] FALSE
# "Not TRUE" is the same as FALSE
!TRUE
## [1] FALSE
# TRUE AND FALSE is always FALSE
TRUE & FALSE
## [1] FALSE
# Negation is the same as saying "not"
!TRUE
## [1] FALSE
# Combination of statements: the first statement, 2 < 3, is TRUE and the
# second, 1 == 5, is FALSE, so we get TRUE OR FALSE = TRUE
2 < 3 | 1 == 5
## [1] TRUE

# Assigning values to variables

Total_Bases <- 6 + 5
Total_Bases * 3
## [1] 33
# To see the variables that are currently defined, use ls() (as in "list")
ls()
##  [1] "AB"              "ans"             "ans2"            "bats"           
##  [5] "bats2"           "Batting_average" "BB"              "H"              
##  [9] "HBP"             "OBP"             "OBP2"            "SF"             
## [13] "Total_Bases"     "total_bats"      "totalbats2"
# Remove a variable from the workspace
rm(Total_Bases)
# See the change: Total_Bases is no longer listed
ls()
##  [1] "AB"              "ans"             "ans2"            "bats"           
##  [5] "bats2"           "Batting_average" "BB"              "H"              
##  [9] "HBP"             "OBP"             "OBP2"            "SF"             
## [13] "total_bats"      "totalbats2"

# Vectors: the basic type of object in R is a vector, which is an ordered
# collection of values of the same type. You can create a vector using the
# c() function (as in "concatenate").

# Create a vector called "pitches_by_innings" holding 5 made-up values
pitches_by_innings <- c(12, 15, 10, 20, 10)
pitches_by_innings
## [1] 12 15 10 20 10
# Create a vector called "strikes_by_innings" holding 5 made-up values
strikes_by_innings <- c(9, 12, 6, 14, 9)
strikes_by_innings
## [1]  9 12  6 14  9
# Question_4: Define two vectors, runs_per_9innings and hits_per_9innings,
# each with five elements.
runs_per_9innings <- c(10, 5, 4, 18, 22)
hits_per_9innings <- c(9, 13, 1, 7, 20)
# Show the first new vector
runs_per_9innings
## [1] 10  5  4 18 22
# Show the second new vector
hits_per_9innings
## [1]  9 13  1  7 20

# There are also some functions that will create vectors with regular
# patterns, like repeated elements.

# Replicate function: creates a vector of ten 2's
rep(2, times = 10)
##  [1] 2 2 2 2 2 2 2 2 2 2
# Replicate the number 1 four times
rep(1, times = 4)
## [1] 1 1 1 1
# Create a vector of consecutive numbers from 1 to 5
1:5
## [1] 1 2 3 4 5
# Create a vector of consecutive numbers from 2 to 10
2:10
## [1]  2  3  4  5  6  7  8  9 10
# Sequence from 1 to 10 with a step of 2
seq(from = 1, to = 10, by = 2)
## [1] 1 3 5 7 9
# Sequence from 2 to 13 with a step of 3
seq(from = 2, to = 13, by = 3)
## [1]  2  5  8 11

# Many functions and operators like + or - work on all elements of a vector.

# Add vectors (element by element)
pitches_by_innings + strikes_by_innings
## [1] 21 27 16 34 19
# Compare vectors: the elements of both vectors are compared one-by-one,
# in order
pitches_by_innings == strikes_by_innings
## [1] FALSE FALSE FALSE FALSE FALSE
# If we create 2 more vectors holding the same numbers, the comparison
# shows TRUE for all values

X <- c(10, 12)
Z <- c(10, 12)

X == Z
## [1] TRUE TRUE
# Find the length of a vector, i.e. the number of elements it holds
length(pitches_by_innings)
## [1] 5
# Find the minimum value in the vector
min(pitches_by_innings)
## [1] 10
# Find the average value in the vector
mean(pitches_by_innings)
## [1] 13.4

# You can access parts of a vector by using [. Recall the value of the
# vector pitches_by_innings.

pitches_by_innings
## [1] 12 15 10 20 10
# Get the first element
pitches_by_innings[1]
## [1] 12
# Question_5: Get the first element of hits_per_9innings.
# R vectors are indexed from 1, so the first element is at position 1.
hits_per_9innings[1]
## [1] 9

# If you want to get the last element of pitches_by_innings without
# explicitly typing its number of elements, make use of the length()
# function, which calculates the length of a vector.

# This prints the last value in the vector
pitches_by_innings[length(pitches_by_innings)]
## [1] 10
# Question_6: Get the last element of hits_per_9innings.
hits_per_9innings[length(hits_per_9innings)]
## [1] 20

# You can also extract multiple values from a vector by indexing with a
# vector of positions. For instance, to get the 2nd through 4th values use

pitches_by_innings[2:4]
## [1] 15 10 20

# Vectors can also hold strings or logical values

player_positions <- c("catcher", "pitcher", "infielders", "outfielders")

# Data frames: in statistical applications, data is often stored as a data
# frame, which is like a spreadsheet, with rows as observations and columns
# as variables.

# To manually create a data frame, use the data.frame() function.

# The data is in millions
data.frame(
  bonus = c(2, 3, 1),
  active_roster = c("yes", "no", "yes"),
  salary = c(1.5, 2.5, 1)
)

# Most often you will be using data frames loaded from a file, for example
# the results of a fan survey. The functions load() or read.table() can be
# used for this.

# How to make a random sample

# To randomly select a sample use the function sample(). The following code
# selects 5 numbers between 1 and 10 at random (without duplication).
# The draw is random, so the output shown below is only one possible result.
sample(1:10, size = 5)
## [1]  2  4  8  9 10
# The first argument gives the vector of data to select elements from.
# The second argument (size=) gives the size of the sample to select.
# Taking a simple random sample from a data frame is only slightly more
# complicated, having two steps:

# 1. Use sample() to select a sample of size n from a vector of the row
#    numbers of the data frame.
# 2. Use the index operator [ to select those rows from the data frame.
# Consider the following example with fake data. First, make up a data frame
# with two columns. (LETTERS is a character vector of length 26 with capital
# letters "A" to "Z"; LETTERS is automatically defined and pre-loaded in R.)
bar <- data.frame(var1 = LETTERS[1:10], var2 = 1:10)
# Check data frame
bar

# Suppose you want to select a random sample of size 5. First, define a
# variable n with the size of the sample, i.e. 5

n <- 5

# Now, select a sample of size 5 from the vector with 1 to 10 (the number of
# rows in bar). Use the function nrow() to find the number of rows in bar
# instead of manually entering that number.

# Use seq_len() to create a vector with all the integers between 1 and the
# number of rows in bar. Unlike 1:nrow(bar), seq_len() gives an empty vector
# for a data frame with zero rows instead of counting down through c(1, 0).

samplerows <- sample(seq_len(nrow(bar)), size = n)
# Print sample rows (random, so your values will differ)
samplerows
## [1] 1 3 6 7 4

# The variable samplerows contains the rows of bar which make a random
# sample from all the rows in bar. Extract those rows from bar with

# Extract rows
barsample <- bar[samplerows, ]

# Print sample
print(barsample)
##   var1 var2
## 1    A    1
## 3    C    3
## 6    F    6
## 7    G    7
## 4    D    4

# The code above creates a new data frame called barsample with a random
# sample of rows from bar.

# In a single line of code:

bar[sample(seq_len(nrow(bar)), n), ]

# Using tables
# The table() command allows us to look at tables. Its simplest usage looks
# like table(x) where x is a categorical variable.

# For example, a survey asks people if they support the home team or not.
# The data is

# Yes, No, No, Yes, Yes

# We can enter this into R with the c() command, and summarize it with the
# table() command as follows

x <- c("Yes", "No", "No", "Yes", "Yes")
table(x)
## x
##  No Yes 
##   2   3

# Numerical measures of center and spread
# Suppose the yearly compensations of MLB teams' CEOs are sampled and the
# following values are found (in millions)

# 12 .4 5 2 50 8 3 1 4 0.25

# Create a variable with the salaries in millions
sals <- c(12, 0.4, 5, 2, 50, 8, 3, 1, 4, 0.25)
# The average
mean(sals)
## [1] 8.565
# Get the variance of the salaries
var(sals)
## [1] 225.5145
# Calculate the standard deviation
sd(sals)
## [1] 15.01714
# The median
median(sals)
## [1] 3.5
# Tukey's five number summary, useful for boxplots
# Five numbers: min, lower hinge, median, upper hinge, max
fivenum(sals)
## [1]  0.25  1.00  3.50  8.00 50.00
# Get the summary statistics of the data
summary(sals)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.250   1.250   3.500   8.565   7.250  50.000

# How about the mode?
# In R we can write our own functions. A first example of a function is
# shown below; it computes the mode of a vector of observations x.

# Return the mode of x, i.e. its most frequent value.
# Each element of x is mapped to its position among the distinct values of
# x, the occurrences of every position are counted, and the distinct value
# with the highest count is returned. which.max() picks the first maximum,
# so when several values tie, the one appearing first in x is returned.
getMode <- function(x) {
  distinct_values <- unique(x)
  positions <- match(x, distinct_values)
  occurrences <- tabulate(positions)
  distinct_values[which.max(occurrences)]
}

# As an example, we can use the function defined above to find the most
# frequent value of pitches_by_innings

# Use the function to get the mode
getMode(x = pitches_by_innings)
## [1] 10
# Question_7: Find the most frequent value of hits_per_9innings.
getMode(x = hits_per_9innings)
## [1] 9
# Question_8: Summarize the following survey with the `table()` command:

# What is your favorite day of the week to watch baseball? A total of 10
# fans submitted this survey.

# Saturday, Saturday, Sunday, Monday, Saturday, Tuesday, Sunday, Friday,
# Friday, Monday
game_day <- c(
  "Saturday", "Saturday", "Sunday", "Monday", "Saturday",
  "Tuesday", "Sunday", "Friday", "Friday", "Monday"
)

table(game_day)
## game_day
##   Friday   Monday Saturday   Sunday  Tuesday 
##        2        2        3        2        1
# Question_9: What is the most frequent answer recorded in the survey?
# Use the getMode function to compute the result.
getMode(x = game_day)
## [1] "Saturday"