MATH2405 Assessment 1

Setup

library(readr)
library(stringr)
library(tidyr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(forcats)
library(forestmangr)
options(warn=-1)

Read/Import Data

#assign address to url variable
url <- "http://data.gov.au/data/dataset/6d7bacba-8e0f-4105-b7bc-9fe11eda9f89/resource/8d3e12fa-a9fd-4268-9164-6240a97ff44d/download/convicts.csv"

#Read the data with only relevant columns specified

convicts <- read_csv(url) [,c(6,8:12)]

## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_character(),
##   CON_IDX = col_double(),
##   POLICE_NO = col_double(),
##   EXTRA_ID = col_double(),
##   CHILD_FATHER = col_logical()
## )
## i Use `spec()` for the full column specifications.

#Review parsing errors

problems(convicts)

#view first few rows of imported dataset

head(convicts)

Steps taken to Import Data

The address for the online csv file was assigned to a variable named ‘url’ to make it easier to read in.
The readr function ‘read_csv’ was used to read in the data as it is purported to be 10 times faster than equivalent base functions. The data was saved as a data frame called ‘convicts’.
Rather than importing the whole dataset which contained a lot of superfluous information, only relevant variables were selected to read in.
There was a parsing error when loading in the data. By using the ‘problems’ function I was able to see that the error was due to a difference in the expected v actual number of columns. This has no impact on my planned analysis so can be disregarded.
After importing the data, the ‘head’ function was used to view the first few rows of data.

Data description

The Tasmanian Convicts (1803-1893) dataset was sourced from data.gov.au at the following url: http://data.gov.au/data/dataset/6d7bacba-8e0f-4105-b7bc-9fe11eda9f89/resource/8d3e12fa-a9fd-4268-9164-6240a97ff44d/download/convicts.csv

It contains over 80,000 records of convicts who were either transported to Tasmania or convicted locally between 1803-1893.

Variable Descriptions:

VOYAGE_NO: Sequential numbering of voyages, ordered chronologically.
DEPARTURE_DATE: The date the ship left the port of departure
SHIP: The name of the ship, including the trip number if there were multiple voyages.
NAME: The name of the convict transported.Some observations contain multiple convicts separated by a ‘|’.
YEAR: The year/s of the voyage. If the journey extended across two years, the observation contains two years separated by a ‘|’.
DEPARTURE PORT: Shows the port of departure.

Inspect dataset and variables

#number of rows

nrow(convicts)

## [1] 81507

#number of columns

ncol(convicts)

## [1] 6

#alternately, see both rows and columns with dim() function

dim(convicts)

## [1] 81507     6

#Check the data types - currently all are Character type

sapply(convicts, class)

##      VOYAGE_NO DEPARTURE_DATE           SHIP           NAME           YEAR 
##    "character"    "character"    "character"    "character"    "character" 
## DEPARTURE_PORT 
##    "character"

#Change column names from uppercase to proper

colnames(convicts)<-str_to_title(colnames(convicts))

#Reassign data types as appropriate 
#cannot reassign Year yet due to multiple values - to be addressed later

convicts$Voyage_no<-as.integer(convicts$Voyage_no)
convicts$Departure_date<-as.factor(convicts$Departure_date)
convicts$Ship<-as.factor(convicts$Ship)
convicts$Name<-as.factor(convicts$Name)
convicts$Departure_port<-as.factor(convicts$Departure_port)

#Review class amendments
sapply(convicts, class)

##      Voyage_no Departure_date           Ship           Name           Year 
##      "integer"       "factor"       "factor"       "factor"    "character" 
## Departure_port 
##       "factor"

head(convicts)

Tidy data

Reshape the dataset into a tidy format.

#Split out Departure_date column into Day, Month_name and Year (retain original column)

convicts$datesplit<-convicts$Departure_date

convicts<-convicts%>%
  separate(datesplit, c("Day", "Month_name", "Yr"), sep=" ")

#Reassign new variables to correct data types

convicts$Day<-as.integer(convicts$Day)
convicts$Yr<-as.integer(convicts$Yr)

# Order levels for Month Variable

convicts$Month_name<-factor(convicts$Month_name,
              levels=c('Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'),
              ordered=TRUE)

#Year column contains multiple values in some cells. Separate into separate columns.

convicts<-convicts %>% 
  separate(Year, into = c("Arrival_year", "Departure_year"))

#Replace missing values in Arrival Year with values in Departure Year. These are voyages that departed and arrived in the same year.

convicts$Departure_year<-ifelse(is.na(convicts$Departure_year), (convicts$Arrival_year), convicts$Departure_year)

#Reassign class to factor

convicts$Arrival_year<-as.factor(convicts$Arrival_year)
convicts$Departure_year<-as.factor(convicts$Departure_year)

head(convicts)

#Name column contains multiple values in some cells, separated by "|". Need to separate into separate columns. 

#First, create a new variable to count how many | are in each cell, then n+1 = number of convicts per entry

convicts$n<-stringi::stri_count(convicts$Name, fixed = "|")+1

#Filter out any rows with no convicts

convicts<-convicts %>% filter(!is.na(n))

#Can now work out the maximum number of names in any cell

max(convicts$n)

## [1] 12

#Separate names into separate columns - 12 columns needed as determined in previous step

convicts<-convicts%>%
   separate(Name, c("Name", "Name2", "Name3", "Name4", "Name5","Name6","Name7","Name8","Name9","Name10","Name11","Name12"), sep="[|]")

#create new data frames for each of the Name columns, filtered to only show rows with values

convicts1<-convicts %>% select(-(Name2:Name12)) %>% filter(!is.na(Name))
convicts2<-convicts %>% select(-(Name),-(Name3:Name12)) %>% filter(!is.na(Name2))
convicts3<-convicts %>% select(-(Name:Name2),-(Name4:Name12)) %>% filter(!is.na(Name3))
convicts4<-convicts %>% select(-(Name:Name3),-(Name5:Name12)) %>% filter(!is.na(Name4))
convicts5<-convicts %>% select(-(Name:Name4),-(Name6:Name12)) %>% filter(!is.na(Name5))
convicts6<-convicts %>% select(-(Name:Name5),-(Name7:Name12)) %>% filter(!is.na(Name6))
convicts7<-convicts %>% select(-(Name:Name6),-(Name8:Name12)) %>% filter(!is.na(Name7))
convicts8<-convicts %>% select(-(Name:Name7),-(Name9:Name12)) %>% filter(!is.na(Name8))
convicts9<-convicts %>% select(-(Name:Name8),-(Name10:Name12)) %>% filter(!is.na(Name9))
convicts10<-convicts %>% select(-(Name:Name9),-(Name11:Name12)) %>% filter(!is.na(Name10))
convicts11<-convicts %>% select(-(Name:Name10),-(Name12)) %>% filter(!is.na(Name11))
convicts12<-convicts %>% select(-(Name:Name11)) %>% filter(!is.na(Name12))

#Rename Name columns so all data frames have the same column names

names(convicts2)[names(convicts2)=='Name2']<-'Name'
names(convicts3)[names(convicts3)=='Name3']<-'Name'
names(convicts4)[names(convicts4)=='Name4']<-'Name'
names(convicts5)[names(convicts5)=='Name5']<-'Name'
names(convicts6)[names(convicts6)=='Name6']<-'Name'
names(convicts7)[names(convicts7)=='Name7']<-'Name'
names(convicts8)[names(convicts8)=='Name8']<-'Name'
names(convicts9)[names(convicts9)=='Name9']<-'Name'
names(convicts10)[names(convicts10)=='Name10']<-'Name'
names(convicts11)[names(convicts11)=='Name11']<-'Name'
names(convicts12)[names(convicts12)=='Name12']<-'Name'

#bind rows to merge all data frames back into one

convictsbind<-bind_rows(convicts1, convicts2, convicts3, convicts4, convicts5, convicts6, convicts7, convicts8, convicts9, convicts10, convicts11, convicts12)

#update the original data frame with the changes 

convicts<-convictsbind

#Now we can separate the Name column into Surname and First_name

convicts<-convicts%>%
  separate(Name, c("Surname", "First_name"), sep=",")

#filter out duplicate entries & other faulty data

convicts<-convicts[!duplicated(convicts),]
convicts<-convicts %>% filter(Ship!="1804|1803")

#reorder df to suit preference

convicts<-convicts %>% select (Voyage_no, Day, Month_name, Departure_date, Departure_year, Arrival_year, Departure_port, Ship, Surname, First_name)

head(convicts)

#Address Missing Values for observations with Voyage no 

#Create a dataframe that includes everything except convict names 

completeDF<-convicts %>% select(everything(), -Surname, -First_name) %>% arrange(Voyage_no)

#Reduce down to complete cases

completeDF<-completeDF[complete.cases(completeDF), ]

#Remove Duplicates

completeDF<-completeDF[!duplicated(completeDF),]

#Merge data where Voyage_no and Departure_date is not na

merge<-convicts %>% 
  filter(!is.na(Voyage_no))%>% 
  left_join(completeDF, by="Voyage_no") %>% 
  select(Voyage_no, Day.y, Month_name.y, Departure_date.y, Departure_year.y, Arrival_year.y, Departure_port.y, Ship.y, Surname, First_name)
merge<-merge %>% filter(!is.na(Ship.y))

#Rename columns

names(merge)[names(merge)=='Day.y']<-'Day'
names(merge)[names(merge)=='Month_name.y']<-'Month_name'
names(merge)[names(merge)=='Departure_date.y']<-'Departure_date'
names(merge)[names(merge)=='Departure_year.y']<-'Departure_year'
names(merge)[names(merge)=='Arrival_year.y']<-'Arrival_year'
names(merge)[names(merge)=='Departure_port.y']<-'Departure_port'
names(merge)[names(merge)=='Ship.y']<-'Ship'

#Check all NA values addressed

summary(merge)

##    Voyage_no          Day          Month_name        Departure_date 
##  Min.   :  1.0   Min.   : 1.00   Apr    :14183   14 Jan 1846:  895  
##  1st Qu.:118.0   1st Qu.: 8.00   Sep    : 9354   30 Apr 1848:  792  
##  Median :199.0   Median :15.00   May    : 8535   24 Nov 1852:  730  
##  Mean   :198.7   Mean   :15.66   Jul    : 8477   16 Apr 1843:  716  
##  3rd Qu.:288.0   3rd Qu.:24.00   Jun    : 8206   1 Oct 1843 :  708  
##  Max.   :371.0   Max.   :31.00   Aug    : 8024   2 Apr 1842 :  652  
##                                  (Other):35875   (Other)    :88161  
##  Departure_year   Arrival_year      Departure_port                Ship      
##  1842   : 6607   1844   : 7299   London    :31775   Joseph Somes (1):  895  
##  1844   : 6366   1842   : 7204   Portsmouth:10101   St Vincent      :  717  
##  1843   : 5546   1845   : 5459   Plymouth  : 9811   John Calvin     :  716  
##  1845   : 5109   1843   : 5180   Dublin    : 6736   Anson           :  708  
##  1852   : 4904   1841   : 4347   Downs     : 6227   Waverley (3)    :  590  
##  1841   : 4505   1850   : 4219   Woolwich  : 5880   Sarah           :  558  
##  (Other):59617   (Other):58946   (Other)   :22124   (Other)         :88470  
##    Surname           First_name       
##  Length:92654       Length:92654      
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
##                                       
##

#summarise dataset by number of convicts on each ship

tidydf<-merge %>% 
  group_by(Voyage_no, Day, Month_name, Departure_date, Departure_year, Arrival_year, Departure_port, Ship) %>% 
  summarise(No_convicts=n()) %>% 
  arrange(desc(No_convicts))

## `summarise()` has grouped output by 'Voyage_no', 'Day', 'Month_name', 'Departure_date', 'Departure_year', 'Arrival_year', 'Departure_port'. You can override using the `.groups` argument.

head(tidydf)

Summary statistics

Provide summary statistics (mean, median, minimum, maximum, standard deviation) of total number of convicts (numerical) grouped by year of departure (categorical)

sumstats<-tidydf %>% 
  group_by(Departure_year) %>% 
  summarise(mean = mean(No_convicts), 
            median = median(No_convicts), 
            min = min(No_convicts), 
            max = max(No_convicts), 
            sd = sd(No_convicts))

#use forestmangr package to round all df values

round_df(sumstats)

Create a list

#create list of Month name as a numeric value

listNum<-list(as.numeric(tidydf$Month_name))

Join the list to original data frame

#convert the list to data frames 

listdf<-as.data.frame(listNum)
colnames(listdf)<-("Month")

#join both list data frames together into new data frame

tidydf<-cbind(tidydf, listdf)

head(tidydf)

Subsetting I

#Create subset of the first 10 observations 
subset1<-tidydf[1:10, ]
subset1

#Convert the subset to a matrix
subset_matrix<-data.matrix(subset1)
subset_matrix

##       Voyage_no Day Month_name Departure_date Departure_year Arrival_year
##  [1,]       227   1         10             11             43           44
##  [2,]       262  22          9            164             45           45
##  [3,]       338  24          1            181             51           51
##  [4,]       269   1          5              8             46           46
##  [5,]       341  27          4            208             51           51
##  [6,]       324  13          4             50             50           50
##  [7,]       187  26         10            206             41           42
##  [8,]       256   5          7            279             45           45
##  [9,]       256  14          1             71             46           46
## [10,]       239  17          5            106             44           44
##       Departure_port Ship No_convicts Month
##  [1,]             17   35         708    10
##  [2,]             17  640         510     9
##  [3,]             12   89         509     1
##  [4,]             19  629         507     5
##  [5,]             13   65         504     4
##  [6,]              5   73         486     4
##  [7,]             17  808         485    10
##  [8,]             13  275         476     7
##  [9,]             13  432         476     1
## [10,]              5   77         473     5

summary(subset_matrix)

##    Voyage_no          Day         Month_name   Departure_date   Departure_year 
##  Min.   :187.0   Min.   : 1.0   Min.   : 1.0   Min.   :  8.00   Min.   :41.00  
##  1st Qu.:243.2   1st Qu.: 7.0   1st Qu.: 4.0   1st Qu.: 55.25   1st Qu.:44.25  
##  Median :259.0   Median :15.5   Median : 5.0   Median :135.00   Median :45.50  
##  Mean   :269.9   Mean   :15.0   Mean   : 5.6   Mean   :128.40   Mean   :46.20  
##  3rd Qu.:310.2   3rd Qu.:23.5   3rd Qu.: 8.5   3rd Qu.:199.75   3rd Qu.:49.00  
##  Max.   :341.0   Max.   :27.0   Max.   :10.0   Max.   :279.00   Max.   :51.00  
##   Arrival_year   Departure_port       Ship        No_convicts        Month     
##  Min.   :42.00   Min.   : 5.00   Min.   : 35.0   Min.   :473.0   Min.   : 1.0  
##  1st Qu.:44.25   1st Qu.:12.25   1st Qu.: 74.0   1st Qu.:478.2   1st Qu.: 4.0  
##  Median :45.50   Median :13.00   Median :182.0   Median :495.0   Median : 5.0  
##  Mean   :46.40   Mean   :13.10   Mean   :312.3   Mean   :513.4   Mean   : 5.6  
##  3rd Qu.:49.00   3rd Qu.:17.00   3rd Qu.:579.8   3rd Qu.:508.5   3rd Qu.: 8.5  
##  Max.   :51.00   Max.   :19.00   Max.   :808.0   Max.   :708.0   Max.   :10.0

By converting the data frame to a matrix, the data frame has been converted to a numeric values. For qualitative values, the coercion seems to assign ordered numerical values.

Subsetting II

#subset the tidydf dataframe to include only the first and last variable
subset2<-tidydf[,c(1,10)]
subset2

#save as an RDS file
saveRDS(subset2, file="subset2.rds")

Create a new Data Frame

#Create data vector 1 - Ordinal

Day<- as.factor(c("Monday",
                  "Tuesday",
                  "Wednesday",
                  "Thursday",
                  "Friday",
                  "Saturday",
                  "Sunday"))

#Assign levels

Day<- factor(Day,
                levels=c("Monday",
                  "Tuesday",
                  "Wednesday",
                  "Thursday",
                  "Friday",
                  "Saturday",
                  "Sunday"))

#Show structure

str(Day)

##  Factor w/ 7 levels "Monday","Tuesday",..: 1 2 3 4 5 6 7

#Show levels

Day

## [1] Monday    Tuesday   Wednesday Thursday  Friday    Saturday  Sunday   
## Levels: Monday Tuesday Wednesday Thursday Friday Saturday Sunday

#Create data vector 2 - Numerical (Integer)

No_customers<-as.integer(c(24, 28, 23, 12, 17, 48, 42))

#Show structure

str(No_customers)

##  int [1:7] 24 28 23 12 17 48 42

#Create a dataframe with both vectors

df<-data.frame(Day, No_customers)
df

#Create another numeric vector

Sales<-as.double(c(245.25, 221.84, 184.12, 150.15, 174.24, 486.54, 423.84))
Sales

## [1] 245.25 221.84 184.12 150.15 174.24 486.54 423.84

#Show structure of new vector

str(Sales)

##  num [1:7] 245 222 184 150 174 ...

# Use cbind() to add new vector to data frame

df<-cbind(df, Sales)
df

# Check dimensions of updated data frame

dim(df)

## [1] 7 3

# Check attributes of updated data frame

str(df)

## 'data.frame':    7 obs. of  3 variables:
##  $ Day         : Factor w/ 7 levels "Monday","Tuesday",..: 1 2 3 4 5 6 7
##  $ No_customers: int  24 28 23 12 17 48 42
##  $ Sales       : num  245 222 184 150 174 ...

Create another Data Frame

#Create a common variable with previous dataset

Day<- as.factor(c("Monday",
                  "Tuesday",
                  "Wednesday",
                  "Thursday",
                  "Friday",
                  "Saturday",
                  "Sunday"))

#Assign levels

Day<- factor(Day,
                levels=c("Monday",
                  "Tuesday",
                  "Wednesday",
                  "Thursday",
                  "Friday",
                  "Saturday",
                  "Sunday"))

#Create a second variable for new dataset

Employee<- as.factor(c("Steven",
                       "Steven",
                        "Wendy", 
                        "Steven", 
                        "Jane", 
                        "Peter", 
                        "Peter"))

# create new data frame with the two new variables

newdf<-data.frame(Day, Employee)
newdf

# Join the new data frame to the previous data frame

join_df<-df %>%
  full_join(newdf, by="Day") 

join_df