# Three parallel vectors describing the instructors: name, favourite colour
# and age. Each is echoed right after it is created.
names <- c("Sophie", "Hanh", "Hannah", "Emily", "Thea")
names
## [1] "Sophie" "Hanh" "Hannah" "Emily" "Thea"
colors <- c("purple", "red", "blue", "green", "pink")
colors
## [1] "purple" "red" "blue" "green" "pink"
ages <- c(42, 55, 34, 21, 54)
ages
## [1] 42 55 34 21 54
# All three vectors must have the same length to become columns of one
# data frame.
length(names)
## [1] 5
length(colors)
## [1] 5
length(ages)
## [1] 5
# Bind the vectors column-wise; the column names are spelled out explicitly
# and match the vector names.
instructors <- data.frame(names = names, colors = colors, ages = ages)
instructors
## names colors ages
## 1 Sophie purple 42
## 2 Hanh red 55
## 3 Hannah blue 34
## 4 Emily green 21
## 5 Thea pink 54
data-frame-name$column-name - access a single column of a data frame by name
# access the names column in the instructors data frame
# Pull the names column out of the data frame as a plain vector.
instructors[["names"]]
## [1] "Sophie" "Hanh" "Hannah" "Emily" "Thea"
read.csv() - import data from a CSV file
# Import the Titanic passenger data from a CSV file hosted on GitHub.
titanic <- read.csv("https://raw.githubusercontent.com/zeigna/AppliedStatsAnalysis/master/titanic.csv")
# View() opens a spreadsheet-style viewer, which is only useful in an
# interactive session; guard it so the script can also run unattended
# (e.g. via Rscript) without tripping over the viewer.
if (interactive()) {
  View(titanic)
}
head() - return the first rows of a dataset (6 by default)
# Preview the first six rows (the default row count, written out explicitly).
utils::head(titanic, n = 6L)
## PassengerId Survived Pclass
## 1 1 0 3
## 2 2 1 1
## 3 3 1 3
## 4 4 1 1
## 5 5 0 3
## 6 6 0 3
## Name Sex Age SibSp Parch
## 1 Braund, Mr. Owen Harris male 22 1 0
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female 38 1 0
## 3 Heikkinen, Miss. Laina female 26 0 0
## 4 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35 1 0
## 5 Allen, Mr. William Henry male 35 0 0
## 6 Moran, Mr. James male NA 0 0
## Ticket Fare Cabin Embarked
## 1 A/5 21171 7.2500 S
## 2 PC 17599 71.2833 C85 C
## 3 STON/O2. 3101282 7.9250 S
## 4 113803 53.1000 C123 S
## 5 373450 8.0500 S
## 6 330877 8.4583 Q
# Preview only the first two rows by overriding the default row count.
utils::head(titanic, n = 2L)
## PassengerId Survived Pclass
## 1 1 0 3
## 2 2 1 1
## Name Sex Age SibSp Parch
## 1 Braund, Mr. Owen Harris male 22 1 0
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female 38 1 0
## Ticket Fare Cabin Embarked
## 1 A/5 21171 7.2500 S
## 2 PC 17599 71.2833 C85 C
str() - display the structure of an object
# Show each column's type and first few values before any conversion.
utils::str(titanic)
## 'data.frame': 891 obs. of 12 variables:
## $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
## $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : chr "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
## $ Sex : chr "male" "female" "female" "female" ...
## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : chr "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : chr "" "C85" "" "C123" ...
## $ Embarked : chr "S" "C" "S" "S" ...
We noticed that Survived, Pclass, Sex, and Embarked are qualitative (categorical) variables, which should be converted to factors.
as.factor() - convert a vector into a factor
# Convert the four categorical columns to factors in one pass instead of
# repeating the same assignment for each column.
factor_cols <- c("Survived", "Pclass", "Sex", "Embarked")
titanic[factor_cols] <- lapply(titanic[factor_cols], as.factor)
Check the change using str()
# Re-inspect the structure to confirm the converted columns are now factors.
utils::str(titanic)
## 'data.frame': 891 obs. of 12 variables:
## $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : Factor w/ 2 levels "0","1": 1 2 2 2 1 1 1 1 2 2 ...
## $ Pclass : Factor w/ 3 levels "1","2","3": 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : chr "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
## $ Sex : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : chr "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : chr "" "C85" "" "C123" ...
## $ Embarked : Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 3 4 4 4 2 ...
mean()
# Mean passenger age; na.rm = TRUE drops the missing ages before averaging.
mean(titanic[["Age"]], na.rm = TRUE)
## [1] 29.69912
var()
# Sample variance of passenger age, ignoring missing values.
stats::var(titanic[["Age"]], na.rm = TRUE)
## [1] 211.0191
sd()
# Sample standard deviation of passenger age, ignoring missing values.
stats::sd(titanic[["Age"]], na.rm = TRUE)
## [1] 14.5265
max()
# Age of the oldest passenger, ignoring missing values.
max(titanic[["Age"]], na.rm = TRUE)
## [1] 80
min()
# Age of the youngest passenger, ignoring missing values.
min(titanic[["Age"]], na.rm = TRUE)
## [1] 0.42
colors()
# Bar chart of passenger counts per level of the Sex column.
barplot(
  table(titanic$Sex),
  # Titles and axis labels
  main = "Passenger Counts for Different Genders",
  xlab = "Gender",
  ylab = "Passenger Counts",
  # Axis range and bar spacing
  ylim = c(0, 700),
  space = 2,
  # Fill and outline colours
  col = "skyblue",
  border = "salmon"
)
# Histogram of the Age column with a fixed y-axis range.
hist(
  titanic$Age,
  # Titles and axis labels
  main = "Histogram of Passenger Age",
  xlab = "Passenger Age",
  ylab = "Passenger Counts",
  # Axis range
  ylim = c(0, 300),
  # Fill and outline colours
  col = "lavender",
  border = "purple"
)
# Box plot summarising the distribution of the Age column.
boxplot(
  titanic$Age,
  # Title and axis label
  main = "Boxplot of Passenger Age on the Titanic",
  ylab = "Passenger Age",
  # Fill and outline colours
  col = "lavender",
  border = "purple"
)