January 10, 2018

Q7: BONUS - place the original .csv in a github file and have R read from the link. This will be a very useful skill as you progress in your data science education and career.

library(readr)
library(RCurl)
## Loading required package: bitops
# The Titanic data set was chosen because it has a large number of observations.
# The CSV is hosted on GitHub and is read through its raw https link.

fileURL <- "https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/csv/datasets/Titanic.csv"
# getURL() downloads the file as one string, which read.csv() parses as text.
# stringsAsFactors is spelled out because its default changed to FALSE in
# R 4.0.0; the factor summaries printed later need the text columns as factors.
titanicDF <- read.csv(
  text = getURL(fileURL),
  header = TRUE,
  sep = ",",
  stringsAsFactors = TRUE
)

# print basic info to make sure data shows correctly
head(titanicDF)
##   X                                          Name PClass   Age    Sex
## 1 1                  Allen, Miss Elisabeth Walton    1st 29.00 female
## 2 2                   Allison, Miss Helen Loraine    1st  2.00 female
## 3 3           Allison, Mr Hudson Joshua Creighton    1st 30.00   male
## 4 4 Allison, Mrs Hudson JC (Bessie Waldo Daniels)    1st 25.00 female
## 5 5                 Allison, Master Hudson Trevor    1st  0.92   male
## 6 6                            Anderson, Mr Harry    1st 47.00   male
##   Survived SexCode
## 1        1       1
## 2        0       1
## 3        0       0
## 4        0       1
## 5        1       0
## 6        1       0
nrow(titanicDF)
## [1] 1313

Q1: Use the summary function to gain an overview of the data set. Then display the mean and median for at least two attributes

# Overview of every column in the full data set
summary(object = titanicDF)
##        X                                  Name      PClass   
##  Min.   :   1   Carlsson, Mr Frans Olof     :   2   *  :  1  
##  1st Qu.: 329   Connolly, Miss Kate         :   2   1st:322  
##  Median : 657   Kelly, Mr James             :   2   2nd:279  
##  Mean   : 657   Abbing, Mr Anthony          :   1   3rd:711  
##  3rd Qu.: 985   Abbott, Master Eugene Joseph:   1            
##  Max.   :1313   Abbott, Mr Rossmore Edward  :   1            
##                 (Other)                     :1304            
##       Age            Sex         Survived         SexCode      
##  Min.   : 0.17   female:462   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:21.00   male  :851   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :28.00                Median :0.0000   Median :0.0000  
##  Mean   :30.40                Mean   :0.3427   Mean   :0.3519  
##  3rd Qu.:39.00                3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :71.00                Max.   :1.0000   Max.   :1.0000  
##  NA's   :557
# NOTE: Age is NA for many rows, so the data is cleaned up before computing
# the mean and median values.
# complete.cases() flags the rows which do not have any NA values.
titanicCleanDF1 <- titanicDF[complete.cases(titanicDF), ]

nrow(titanicDF)
## [1] 1313
nrow(titanicCleanDF1)
## [1] 756
# Compute each statistic of the cleaned data frame once and keep the values
# so they can be compared with the statistics of the subset.
meanAgeOrg <- mean(titanicCleanDF1$Age)
medianAgeOrg <- median(titanicCleanDF1$Age)
meanSurvivedOrg <- mean(titanicCleanDF1$Survived)
medianSurvivedOrg <- median(titanicCleanDF1$Survived)

# mean and median of Age of people on the ship
meanAgeOrg
## [1] 30.39799
medianAgeOrg
## [1] 28
# mean and median of the 0/1 Survived column
meanSurvivedOrg
## [1] 0.4140212
medianSurvivedOrg
## [1] 0

Q2: Create a new data frame with a subset of the columns and rows. Make sure to rename it.

# Create a subset with only the female passengers (rows) and five of the
# columns. Inside subset() the columns are visible by name, so the
# data frame prefix is not needed in the row condition.
titanicFemaleDF <- subset(
  titanicCleanDF1,
  Sex == "female",
  select = c(Name, PClass, Age, Sex, Survived)
)
nrow(titanicFemaleDF)
## [1] 288

Q3: Create new column names for the new data frame

# Give every column of the subset a new name (positional, same column order)
colnames(titanicFemaleDF) <- c(
  "FullName1", "Class1", "Age1", "Sex1", "Survived1"
)
head(titanicFemaleDF, n = 6L)
##                                           FullName1 Class1 Age1   Sex1
## 1                      Allen, Miss Elisabeth Walton    1st   29 female
## 2                       Allison, Miss Helen Loraine    1st    2 female
## 4     Allison, Mrs Hudson JC (Bessie Waldo Daniels)    1st   25 female
## 7                  Andrews, Miss Kornelia Theodosia    1st   63 female
## 9      Appleton, Mrs Edward Dale (Charlotte Lamson)    1st   58 female
## 12 Astor, Mrs John Jacob (Madeleine Talmadge Force)    1st   19 female
##    Survived1
## 1          1
## 2          0
## 4          0
## 7          1
## 9          1
## 12         1

Q4: Use the summary function to create an overview of your new data frame. Then print the mean and median for the same two attributes. Please compare.

# Overview of every column in the female subset
summary(object = titanicFemaleDF)
##                              FullName1   Class1         Age1      
##  Connolly, Miss Kate              :  2   *  :  0   Min.   : 0.17  
##  Abbott, Mrs Stanton (Rosa)       :  1   1st:101   1st Qu.:19.00  
##  Abelseth, Miss Anna Karen        :  1   2nd: 85   Median :27.00  
##  Abelson, Mrs Samuel (Anna)       :  1   3rd:102   Mean   :29.40  
##  Abraham, Mrs Joseph (Sophie Easu):  1             3rd Qu.:39.00  
##  Ahlin, Mrs Johanna Persdotter    :  1             Max.   :69.00  
##  (Other)                          :281                            
##      Sex1       Survived1     
##  female:288   Min.   :0.0000  
##  male  :  0   1st Qu.:1.0000  
##               Median :1.0000  
##               Mean   :0.7535  
##               3rd Qu.:1.0000  
##               Max.   :1.0000  
## 
# Compute each statistic of the new data frame once and keep the values
# so they can be compared with the statistics of the original data.
meanAgeNew <- mean(titanicFemaleDF$Age1)
medianAgeNew <- median(titanicFemaleDF$Age1)
meanSurvivedNew <- mean(titanicFemaleDF$Survived1)
medianSurvivedNew <- median(titanicFemaleDF$Survived1)

# mean and median of Age1
meanAgeNew
## [1] 29.39642
medianAgeNew
## [1] 27
# mean and median of the 0/1 Survived1 column
meanSurvivedNew
## [1] 0.7534722
medianSurvivedNew
## [1] 1


# Put the statistics of the cleaned data and of the female subset side by
# side, one row per statistic
all <- c(meanAgeOrg, medianAgeOrg, meanSurvivedOrg, medianSurvivedOrg)
females <- c(meanAgeNew, medianAgeNew, meanSurvivedNew, medianSurvivedNew)
compare <- data.frame(
  all,
  females,
  row.names = c("Mean Age", "Median Age", "Mean Survived", "Median Survived")
)
compare
##                        all    females
## Mean Age        30.3979894 29.3964236
## Median Age      28.0000000 27.0000000
## Mean Survived    0.4140212  0.7534722
## Median Survived  0.0000000  1.0000000
# Interesting to see that most females survived!! :)

Q5: For at least 3 values in a column please rename so that every value in that column is renamed. For example, suppose I have 20 values of the letter "e" in one column. Rename those values so that all 20 would show as "excellent".

# titanicFemaleDF$Class1 is a factor, and its values are easier to change as
# character strings. The column is converted to character once, the three
# class labels are replaced ("1st" -> "First", "2nd" -> "Second",
# "3rd" -> "Third"), and the result is converted back to a factor once.
# local() keeps the temporary character vector out of the workspace.
titanicFemaleDF$Class1 <- local({
  classLabels <- as.character(titanicFemaleDF$Class1)
  classLabels[classLabels == "1st"] <- "First"
  classLabels[classLabels == "2nd"] <- "Second"
  classLabels[classLabels == "3rd"] <- "Third"
  as.factor(classLabels)
})

#Print summary
summary(titanicFemaleDF)
##                              FullName1      Class1         Age1      
##  Connolly, Miss Kate              :  2   First :101   Min.   : 0.17  
##  Abbott, Mrs Stanton (Rosa)       :  1   Second: 85   1st Qu.:19.00  
##  Abelseth, Miss Anna Karen        :  1   Third :102   Median :27.00  
##  Abelson, Mrs Samuel (Anna)       :  1                Mean   :29.40  
##  Abraham, Mrs Joseph (Sophie Easu):  1                3rd Qu.:39.00  
##  Ahlin, Mrs Johanna Persdotter    :  1                Max.   :69.00  
##  (Other)                          :281                               
##      Sex1       Survived1     
##  female:288   Min.   :0.0000  
##  male  :  0   1st Qu.:1.0000  
##               Median :1.0000  
##               Mean   :0.7535  
##               3rd Qu.:1.0000  
##               Max.   :1.0000  
## 

Q6: Display enough rows to see examples of all of steps 1-5 above.

# Show the first 10 rows of the data frame
head(titanicFemaleDF, n = 10L)
##                                           FullName1 Class1 Age1   Sex1
## 1                      Allen, Miss Elisabeth Walton  First   29 female
## 2                       Allison, Miss Helen Loraine  First    2 female
## 4     Allison, Mrs Hudson JC (Bessie Waldo Daniels)  First   25 female
## 7                  Andrews, Miss Kornelia Theodosia  First   63 female
## 9      Appleton, Mrs Edward Dale (Charlotte Lamson)  First   58 female
## 12 Astor, Mrs John Jacob (Madeleine Talmadge Force)  First   19 female
## 16   Baxter, Mrs James (Helene DeLaudeniere Chaput)  First   50 female
## 20  Beckwith, Mrs Richard Leonard (Sallie Monypeny)  First   47 female
## 24           Bishop, Mrs Dickinson H (Helen Walton)  First   19 female
## 28                           Bonnell, Miss Caroline  First   30 female
##    Survived1
## 1          1
## 2          0
## 4          0
## 7          1
## 9          1
## 12         1
## 16         1
## 20         1
## 24         1
## 28         1
# Show 10 consecutive rows starting at the middle row of the data frame
middle <- round(nrow(titanicFemaleDF) / 2)
titanicFemaleDF[seq(from = middle, length.out = 10), ]
##                                FullName1 Class1 Age1   Sex1 Survived1
## 466 Karnes, Mrs J Frank (Claire Bennett) Second   22 female         0
## 469         Kelly, Mrs Florence (Fannie) Second   45 female         1
## 474  Lahtinen, Mrs William (Anna Sylvan) Second   26 female         0
## 476                   Lemore, Mrs Amelia Second   34 female         1
## 478         LaRoche, Mrs Joseph (Juliet) Second   22 female         1
## 479                 LaRoche, Miss Louise Second    1 female         1
## 480                LaRoche, Miss Simonne Second    3 female         1
## 488                       Mack, Mrs Mary Second   57 female         0
## 495   Marshall, Mrs Kate Louise Phillips Second   19 female         1
## 501        Mellenger, Mrs Elizabeth Anne Second   41 female         1
# Print df tail: the last 10 rows
tail(titanicFemaleDF, n = 10L)
##                                FullName1 Class1 Age1   Sex1 Survived1
## 1189      Sandstrom, Miss Beatrice Irene  Third  1.5 female         0
## 1264              Turja, Miss Anna Sofia  Third 18.0 female         1
## 1265                 Turkula, Mrs Hedvig  Third 63.0 female         1
## 1270        Van der Planke, Miss Augusta  Third 18.0 female         0
## 1272           Van der Planke, Mrs Jules  Third 31.0 female         0
## 1277            Van Impe, Miss Catharine  Third 10.0 female         0
## 1279         Van Impe, Mrs Jean Baptiste  Third 30.0 female         0
## 1284 Vestrom, Miss Hulda Amanda Adolfina  Third 14.0 female         0
## 1294                   Wilkes, Mrs Ellen  Third 45.0 female         1
## 1305                 Yasbeck, Mrs Antoni  Third 15.0 female         1