R Notebook

# Election data: one element per election, aligned by position across vectors.

# Name of the winning candidate (the president).
presid_name <- c(
  "Obama", "Bush", "Bush", "Clinton", "Clinton", "Bush Father",
  "Reagan", "Reagan", "Carter", "Nixon", "Nixon", "Johnson",
  "Kennedy", "Eisenhower", "Eisenhower", "Truman"
)

# Height of the winner of each election.
winner <- c(
  185, 182, 182, 188, 188, 188, 185, 185,
  177, 182, 182, 193, 183, 179, 179, 175
)

# Height of the losing opponent of each election.
opponent <- c(
  175, 193, 185, 187, 188, 173, 180, 177,
  183, 185, 180, 180, 182, 178, 178, 173
)

# Election years: count down from 2008 to 1948; the negative `by` steps
# backwards four years at a time.
year <- seq(from = 2008, to = 1948, by = -4)

# Element-wise comparison: TRUE where the winner was strictly taller than the
# opponent, FALSE otherwise (including ties).
isWinnerTaller <- winner > opponent
isWinnerTaller

##  [1]  TRUE FALSE FALSE  TRUE FALSE  TRUE  TRUE  TRUE FALSE FALSE  TRUE  TRUE
## [13]  TRUE  TRUE  TRUE  TRUE

Count how many times the winner has been taller than the opponent.

# Answer the question: how many times has the winner been taller than the opponent?
# sum() on a logical vector counts its TRUE values (TRUE counts as 1, FALSE as 0).
sum(isWinnerTaller)

## [1] 11

# Combine the five vectors into one data frame, one row per election, and
# print it. Each column keeps the name of the vector it came from.
df_presidents <- data.frame(
  presid_name = presid_name,
  year = year,
  winner = winner,
  opponent = opponent,
  isWinnerTaller = isWinnerTaller
)
df_presidents

# Show only the first 6 rows.
head(df_presidents, n = 6)

# Compact summary of the structure: dimensions, then the name, type and
# first values of every column.
str(df_presidents)

## 'data.frame':    16 obs. of  5 variables:
##  $ presid_name   : chr  "Obama" "Bush" "Bush" "Clinton" ...
##  $ year          : num  2008 2004 2000 1996 1992 ...
##  $ winner        : num  185 182 182 188 188 188 185 185 177 182 ...
##  $ opponent      : num  175 193 185 187 188 173 180 177 183 185 ...
##  $ isWinnerTaller: logi  TRUE FALSE FALSE TRUE FALSE TRUE ...

# str() gives a compact overview of the data frame: its dimensions and, for each column, its name, type, and first few values.

# colnames() returns the column names (headers) of the data frame as a
# character vector; nothing is assigned here.
colnames(df_presidents)

## [1] "presid_name"    "year"           "winner"         "opponent"      
## [5] "isWinnerTaller"

# ncol() returns the number of columns (variables).
ncol(df_presidents)

## [1] 5

# nrow() returns the number of rows (records).
nrow(df_presidents)

## [1] 16

# Add a new column to an existing data frame with the `$` operator.
# The height difference is computed from the data frame's own columns rather
# than from the free-standing vectors, so the values always line up with the
# rows of df_presidents.
df_presidents$difference <- df_presidents$winner - df_presidents$opponent
df_presidents

# Dropping a column by position. In df[a, b], `a` selects rows and `b` selects
# columns; a negative index excludes that position. The result is not assigned
# here, so this only displays a copy without the 6th column.
df_presidents[, -6]

# df_presidents itself is unchanged and still contains `difference`.
df_presidents

# A second way to drop a column: select columns by name with a logical mask.
"(!=) <- refers to 'not equal to'"

## [1] "(!=) <- refers to 'not equal to'"

# Keep every column whose name is not "difference". The result is only
# displayed; df_presidents is not modified by this line.
df_presidents[, names(df_presidents) != "difference"]

# Assign the filtered result back so that df_presidents no longer has the
# `difference` column, then print it.
df_presidents <- df_presidents[, names(df_presidents) != "difference"]
df_presidents

# Indexing a data frame ----

# All values of the 2nd column, returned as a plain vector.
df_presidents[, 2]

##  [1] 2008 2004 2000 1996 1992 1988 1984 1980 1976 1972 1968 1964 1960 1956 1952
## [16] 1948

# alternative 2: df_presidents[, "year"]

# alternative 3: df_presidents$year

# The 1st row, with all of its columns.
df_presidents[1, ]

# Values of the last column; ncol() supplies its position.
df_presidents[, ncol(df_presidents)]

##  [1]  TRUE FALSE FALSE  TRUE FALSE  TRUE  TRUE  TRUE FALSE FALSE  TRUE  TRUE
## [13]  TRUE  TRUE  TRUE  TRUE

# alternative: df_presidents[, 5]

# Select columns 3 and 4 (winner and opponent) for the first 3 rows.
df_presidents[1:3, 3:4]

# Some useful functions for working with data frames ----

# subset() keeps the rows for which the condition is TRUE: here, the elections
# in which the winner was taller than the opponent. The logical column can be
# used directly as the condition.
subset(df_presidents, isWinnerTaller)

# The same filter applied to the name column only: the names of the winners
# who were taller than their opponents.
subset(df_presidents$presid_name, df_presidents$isWinnerTaller)

##  [1] "Obama"       "Clinton"     "Bush Father" "Reagan"      "Reagan"     
##  [6] "Nixon"       "Johnson"     "Kennedy"     "Eisenhower"  "Eisenhower" 
## [11] "Truman"

# Other useful functions are apply() and tapply(). To practise them, first add
# a column holding the party of each winner (i.e. the party of the president).

# Party of the winner, one element per election, in the same order as the rows
# of df_presidents.
party <- c(
  "Dem", "Rep", "Rep", "Dem", "Dem", "Rep", "Rep", "Rep",
  "Dem", "Rep", "Rep", "Dem", "Dem", "Rep", "Rep", "Dem"
)

# Store the vector as a new column called "presid_party", then print the
# updated data frame.
df_presidents[["presid_party"]] <- party
df_presidents

# tapply() splits a vector into groups defined by a second vector and applies
# a function to each group.

# Mean height of the presidents from each party.
tapply(
  X = df_presidents$winner,
  INDEX = df_presidents$presid_party,
  FUN = mean
)

##      Dem      Rep 
## 184.1429 182.6667

# Maximum height of the presidents from each party.
tapply(
  X = df_presidents$winner,
  INDEX = df_presidents$presid_party,
  FUN = max
)

## Dem Rep 
## 193 188

# apply() applies a function over the rows or the columns of a matrix or data
# frame. MARGIN = 2 means "by column" (MARGIN = 1 would mean "by row").

# Mean height of the winners and of the opponents.
apply(df_presidents[c("winner", "opponent")], MARGIN = 2, FUN = mean)

##   winner opponent 
## 183.3125 181.0625

# Another way of getting the same column means:
colMeans(df_presidents[c("winner", "opponent")])

##   winner opponent 
## 183.3125 181.0625

# Standard deviation of the winners' and of the opponents' heights.
apply(df_presidents[c("winner", "opponent")], MARGIN = 2, FUN = sd)

##   winner opponent 
## 4.629165 5.579352

# A higher SD means more variation: the opponents' heights (SD 5.58) are more
# spread out around their mean than the winners' heights (SD 4.63).