Review: Data Frames

# Three parallel vectors, one entry per instructor. Wrapping an assignment in
# parentheses both stores the value and prints it.
(names <- c("Sophie", "Hanh", "Hannah", "Emily", "Thea"))
## [1] "Sophie" "Hanh"   "Hannah" "Emily"  "Thea"
(colors <- c("purple", "red", "blue", "green", "pink"))
## [1] "purple" "red"    "blue"   "green"  "pink"
(ages <- c(42, 55, 34, 21, 54))
## [1] 42 55 34 21 54

# The vectors must all have the same length to become columns of one table.
length(names)
## [1] 5
length(colors)
## [1] 5
length(ages)
## [1] 5

# Combine the vectors column-wise; each column is named after its vector.
(instructors <- data.frame(names = names, colors = colors, ages = ages))
##    names colors ages
## 1 Sophie purple   42
## 2   Hanh    red   55
## 3 Hannah   blue   34
## 4  Emily  green   21
## 5   Thea   pink   54

To access a column in a data frame, use the $ operator: data_frame_name$column_name

# Pull the `names` column out of the `instructors` data frame with `$`.
# The result is the column's vector, printed here because the expression
# stands alone at top level.
instructors$names
## [1] "Sophie" "Hanh"   "Hannah" "Emily"  "Thea"

Real-life dataset: the Titanic

read.csv() - import data from a CSV file

# Download the Titanic passenger list straight from GitHub into a data frame.
titanic <- read.csv(
  file = "https://raw.githubusercontent.com/zeigna/AppliedStatsAnalysis/master/titanic.csv"
)

# Open the data in the spreadsheet-style viewer only when someone is at the
# console. View() needs a GUI, so it is skipped in non-interactive runs
# (e.g. Rscript or knitting), where it could fail or block.
if (interactive()) {
  View(titanic)
}

head() - display the first 6 rows of the dataset (use the n argument to change how many)

# Preview the top of the table; n = 6L is head()'s default, written out here.
# The console wraps the 12 columns into three groups.
head(titanic, n = 6L)
##   PassengerId Survived Pclass
## 1           1        0      3
## 2           2        1      1
## 3           3        1      3
## 4           4        1      1
## 5           5        0      3
## 6           6        0      3
##                                                  Name    Sex Age SibSp Parch
## 1                             Braund, Mr. Owen Harris   male  22     1     0
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female  38     1     0
## 3                              Heikkinen, Miss. Laina female  26     0     0
## 4        Futrelle, Mrs. Jacques Heath (Lily May Peel) female  35     1     0
## 5                            Allen, Mr. William Henry   male  35     0     0
## 6                                    Moran, Mr. James   male  NA     0     0
##             Ticket    Fare Cabin Embarked
## 1        A/5 21171  7.2500              S
## 2         PC 17599 71.2833   C85        C
## 3 STON/O2. 3101282  7.9250              S
## 4           113803 53.1000  C123        S
## 5           373450  8.0500              S
## 6           330877  8.4583              Q
# Limit the preview to the first two rows by passing n explicitly.
head(titanic, n = 2L)
##   PassengerId Survived Pclass
## 1           1        0      3
## 2           2        1      1
##                                                  Name    Sex Age SibSp Parch
## 1                             Braund, Mr. Owen Harris   male  22     1     0
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female  38     1     0
##      Ticket    Fare Cabin Embarked
## 1 A/5 21171  7.2500              S
## 2  PC 17599 71.2833   C85        C

str() - check the structure of the dataset (dimensions, column names, and column types)

# Compact overview of the freshly imported table: one line per column giving
# its type and first few values.
str(object = titanic)
## 'data.frame':    891 obs. of  12 variables:
##  $ PassengerId: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Survived   : int  0 1 1 1 0 0 0 0 1 1 ...
##  $ Pclass     : int  3 1 3 1 3 3 1 3 3 2 ...
##  $ Name       : chr  "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
##  $ Sex        : chr  "male" "female" "female" "female" ...
##  $ Age        : num  22 38 26 35 35 NA 54 2 27 14 ...
##  $ SibSp      : int  1 1 0 1 0 0 0 3 0 1 ...
##  $ Parch      : int  0 0 0 0 0 0 0 1 2 0 ...
##  $ Ticket     : chr  "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
##  $ Fare       : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ Cabin      : chr  "" "C85" "" "C123" ...
##  $ Embarked   : chr  "S" "C" "S" "S" ...

Notice that Survived, Pclass, Sex, and Embarked are qualitative (categorical) variables, so they should be converted to factors.

as.factor() - convert a vector into a factor

# Recode the four categorical columns as factors in one pass: as.factor() is
# applied to each selected column and the results are written back in place.
titanic[c("Survived", "Pclass", "Sex", "Embarked")] <- lapply(
  titanic[c("Survived", "Pclass", "Sex", "Embarked")],
  as.factor
)

Check the change using str()

# Inspect the structure again: the four converted columns are now reported as
# Factor with their levels listed.
str(object = titanic)
## 'data.frame':    891 obs. of  12 variables:
##  $ PassengerId: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Survived   : Factor w/ 2 levels "0","1": 1 2 2 2 1 1 1 1 2 2 ...
##  $ Pclass     : Factor w/ 3 levels "1","2","3": 3 1 3 1 3 3 1 3 3 2 ...
##  $ Name       : chr  "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
##  $ Sex        : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
##  $ Age        : num  22 38 26 35 35 NA 54 2 27 14 ...
##  $ SibSp      : int  1 1 0 1 0 0 0 3 0 1 ...
##  $ Parch      : int  0 0 0 0 0 0 0 1 2 0 ...
##  $ Ticket     : chr  "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
##  $ Fare       : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ Cabin      : chr  "" "C85" "" "C123" ...
##  $ Embarked   : Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 3 4 4 4 2 ...

mean()

# Average passenger age; na.rm = TRUE drops the missing ages before averaging.
mean(x = titanic[["Age"]], na.rm = TRUE)
## [1] 29.69912

var()

# Variance of passenger age, computed on the non-missing values only.
var(x = titanic[["Age"]], na.rm = TRUE)
## [1] 211.0191

sd()

# Standard deviation of passenger age, computed on the non-missing values only.
sd(x = titanic[["Age"]], na.rm = TRUE)
## [1] 14.5265

max()

# Age of the oldest passenger with a recorded age (missing ages are skipped).
max(titanic[["Age"]], na.rm = TRUE)
## [1] 80

min()

# Age of the youngest passenger with a recorded age (missing ages are skipped).
min(titanic[["Age"]], na.rm = TRUE)
## [1] 0.42

Visualization

colors()

Bar graph

# Bar chart of passenger counts by sex; table() tallies the rows in each
# level of Sex and those counts become the bar heights.
barplot(
  height = table(titanic$Sex),
  col = "skyblue",
  border = "salmon",
  space = 2,
  ylim = c(0, 700),
  xlab = "Gender",
  ylab = "Passenger Counts",
  main = "Passenger Counts for Different Genders"
)

Histogram

# Histogram of passenger ages, with the count axis fixed to 0-300.
hist(
  x = titanic$Age,
  col = "lavender",
  border = "purple",
  ylim = c(0, 300),
  xlab = "Passenger Age",
  ylab = "Passenger Counts",
  main = "Histogram of Passenger Age"
)

Boxplot

# Single box-and-whisker plot summarising the distribution of passenger ages.
boxplot(
  x = titanic$Age,
  col = "lavender",
  border = "purple",
  ylab = "Passenger Age",
  main = "Boxplot of Passenger Age on the Titanic"
)