Setup

library(readr)
library(stringr)
library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(forcats)
library(forestmangr)
options(warn=-1)

Read/Import Data

#assign address to url variable
url <- "http://data.gov.au/data/dataset/6d7bacba-8e0f-4105-b7bc-9fe11eda9f89/resource/8d3e12fa-a9fd-4268-9164-6240a97ff44d/download/convicts.csv"

#Read the data with only relevant columns specified

convicts <- read_csv(url) [,c(6,8:12)]
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_character(),
##   CON_IDX = col_double(),
##   POLICE_NO = col_double(),
##   EXTRA_ID = col_double(),
##   CHILD_FATHER = col_logical()
## )
## i Use `spec()` for the full column specifications.
#Review parsing errors

problems(convicts)
#view first few rows of imported dataset

head(convicts)

Steps taken to Import Data

  • The address for the online csv file was assigned to a variable named ‘url’ to make it easier to read in.

  • The readr function ‘read_csv’ was used to read in the data as it is purported to be 10 times faster than equivalent base functions. The data was saved as a data frame called ‘convicts’.

  • Rather than importing the whole dataset which contained a lot of superfluous information, only relevant variables were selected to read in.

  • There was a parsing error when loading in the data. By using the ‘problems’ function I was able to see that the error was due to a difference in the expected v actual number of columns. This has no impact on my planned analysis so can be disregarded.

  • After importing the data, the ‘head’ function was used to view the first few rows of data.

Data description

The Tasmanian Convicts (1803-1893) dataset was sourced from data.gov.au at the following url: http://data.gov.au/data/dataset/6d7bacba-8e0f-4105-b7bc-9fe11eda9f89/resource/8d3e12fa-a9fd-4268-9164-6240a97ff44d/download/convicts.csv

It contains over 80,000 records of convicts who were either transported to Tasmania or convicted locally between 1803-1893.

Variable Descriptions:

Inspect dataset and variables

#number of rows

nrow(convicts)
## [1] 81507
#number of columns

ncol(convicts)
## [1] 6
#alternately, see both rows and columns with dim() function

dim(convicts)
## [1] 81507     6
#Check the data types - currently all are Character type

sapply(convicts, class)
##      VOYAGE_NO DEPARTURE_DATE           SHIP           NAME           YEAR 
##    "character"    "character"    "character"    "character"    "character" 
## DEPARTURE_PORT 
##    "character"
#Change column names from uppercase to proper

colnames(convicts)<-str_to_title(colnames(convicts))

#Reassign data types as appropriate 
#cannot reassign Year yet due to multiple values - to be addressed later

convicts$Voyage_no<-as.integer(convicts$Voyage_no)
convicts$Departure_date<-as.factor(convicts$Departure_date)
convicts$Ship<-as.factor(convicts$Ship)
convicts$Name<-as.factor(convicts$Name)
convicts$Departure_port<-as.factor(convicts$Departure_port)

#Review class amendments
sapply(convicts, class)
##      Voyage_no Departure_date           Ship           Name           Year 
##      "integer"       "factor"       "factor"       "factor"    "character" 
## Departure_port 
##       "factor"
head(convicts)

Tidy data

Reshape the dataset into a tidy format.

#Split out Departure_date column into Day, Month_name and Year (retain original column)

convicts$datesplit<-convicts$Departure_date

convicts<-convicts%>%
  separate(datesplit, c("Day", "Month_name", "Yr"), sep=" ")

#Reassign new variables to correct data types

convicts$Day<-as.integer(convicts$Day)
convicts$Yr<-as.integer(convicts$Yr)
# Order levels for Month Variable

convicts$Month_name<-factor(convicts$Month_name,
              levels=c('Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'),
              ordered=TRUE)
#Year column contains multiple values in some cells. Separate into separate columns.

convicts<-convicts %>% 
  separate(Year, into = c("Arrival_year", "Departure_year"))

#Replace missing values in Arrival Year with values in Departure Year. These are voyages that departed and arrived in the same year.

convicts$Departure_year<-ifelse(is.na(convicts$Departure_year), (convicts$Arrival_year), convicts$Departure_year)

#Reassign class to factor

convicts$Arrival_year<-as.factor(convicts$Arrival_year)
convicts$Departure_year<-as.factor(convicts$Departure_year)

head(convicts)
#Name column contains multiple values in some cells, separated by "|". Need to separate into separate columns. 

#First, create a new variable to count how many | are in each cell, then n+1 = number of convicts per entry

convicts$n<-stringi::stri_count(convicts$Name, fixed = "|")+1

#Filter out any rows with no convicts

convicts<-convicts %>% filter(!is.na(n))

#Can now work out the maximum number of names in any cell

max(convicts$n)
## [1] 12
#Separate names into separate columns - 12 columns needed as determined in previous step

convicts<-convicts%>%
   separate(Name, c("Name", "Name2", "Name3", "Name4", "Name5","Name6","Name7","Name8","Name9","Name10","Name11","Name12"), sep="[|]")

#create new data frames for each of the Name columns, filtered to only show rows with values

convicts1<-convicts %>% select(-(Name2:Name12)) %>% filter(!is.na(Name))
convicts2<-convicts %>% select(-(Name),-(Name3:Name12)) %>% filter(!is.na(Name2))
convicts3<-convicts %>% select(-(Name:Name2),-(Name4:Name12)) %>% filter(!is.na(Name3))
convicts4<-convicts %>% select(-(Name:Name3),-(Name5:Name12)) %>% filter(!is.na(Name4))
convicts5<-convicts %>% select(-(Name:Name4),-(Name6:Name12)) %>% filter(!is.na(Name5))
convicts6<-convicts %>% select(-(Name:Name5),-(Name7:Name12)) %>% filter(!is.na(Name6))
convicts7<-convicts %>% select(-(Name:Name6),-(Name8:Name12)) %>% filter(!is.na(Name7))
convicts8<-convicts %>% select(-(Name:Name7),-(Name9:Name12)) %>% filter(!is.na(Name8))
convicts9<-convicts %>% select(-(Name:Name8),-(Name10:Name12)) %>% filter(!is.na(Name9))
convicts10<-convicts %>% select(-(Name:Name9),-(Name11:Name12)) %>% filter(!is.na(Name10))
convicts11<-convicts %>% select(-(Name:Name10),-(Name12)) %>% filter(!is.na(Name11))
convicts12<-convicts %>% select(-(Name:Name11)) %>% filter(!is.na(Name12))

#Rename Name columns so all data frames have the same column names

names(convicts2)[names(convicts2)=='Name2']<-'Name'
names(convicts3)[names(convicts3)=='Name3']<-'Name'
names(convicts4)[names(convicts4)=='Name4']<-'Name'
names(convicts5)[names(convicts5)=='Name5']<-'Name'
names(convicts6)[names(convicts6)=='Name6']<-'Name'
names(convicts7)[names(convicts7)=='Name7']<-'Name'
names(convicts8)[names(convicts8)=='Name8']<-'Name'
names(convicts9)[names(convicts9)=='Name9']<-'Name'
names(convicts10)[names(convicts10)=='Name10']<-'Name'
names(convicts11)[names(convicts11)=='Name11']<-'Name'
names(convicts12)[names(convicts12)=='Name12']<-'Name'

#bind rows to merge all data frames back into one

convictsbind<-bind_rows(convicts1, convicts2, convicts3, convicts4, convicts5, convicts6, convicts7, convicts8, convicts9, convicts10, convicts11, convicts12)

#update the original data frame with the changes 

convicts<-convictsbind
#Now we can separate the Name column into Surname and First_name

convicts<-convicts%>%
  separate(Name, c("Surname", "First_name"), sep=",")

#filter out duplicate entries & other faulty data

convicts<-convicts[!duplicated(convicts),]
convicts<-convicts %>% filter(Ship!="1804|1803")

#reorder df to suit preference

convicts<-convicts %>% select (Voyage_no, Day, Month_name, Departure_date, Departure_year, Arrival_year, Departure_port, Ship, Surname, First_name)

head(convicts)
#Address Missing Values for observations with Voyage no 

#Create a dataframe that includes everything except convict names 

completeDF<-convicts %>% select(everything(), -Surname, -First_name) %>% arrange(Voyage_no)

#Reduce down to complete cases

completeDF<-completeDF[complete.cases(completeDF), ]

#Remove Duplicates

completeDF<-completeDF[!duplicated(completeDF),]

#Merge data where Voyage_no and Departure_date is not na

merge<-convicts %>% 
  filter(!is.na(Voyage_no))%>% 
  left_join(completeDF, by="Voyage_no") %>% 
  select(Voyage_no, Day.y, Month_name.y, Departure_date.y, Departure_year.y, Arrival_year.y, Departure_port.y, Ship.y, Surname, First_name)
merge<-merge %>% filter(!is.na(Ship.y))

#Rename columns

names(merge)[names(merge)=='Day.y']<-'Day'
names(merge)[names(merge)=='Month_name.y']<-'Month_name'
names(merge)[names(merge)=='Departure_date.y']<-'Departure_date'
names(merge)[names(merge)=='Departure_year.y']<-'Departure_year'
names(merge)[names(merge)=='Arrival_year.y']<-'Arrival_year'
names(merge)[names(merge)=='Departure_port.y']<-'Departure_port'
names(merge)[names(merge)=='Ship.y']<-'Ship'
#Check all NA values addressed

summary(merge)
##    Voyage_no          Day          Month_name        Departure_date 
##  Min.   :  1.0   Min.   : 1.00   Apr    :14183   14 Jan 1846:  895  
##  1st Qu.:118.0   1st Qu.: 8.00   Sep    : 9354   30 Apr 1848:  792  
##  Median :199.0   Median :15.00   May    : 8535   24 Nov 1852:  730  
##  Mean   :198.7   Mean   :15.66   Jul    : 8477   16 Apr 1843:  716  
##  3rd Qu.:288.0   3rd Qu.:24.00   Jun    : 8206   1 Oct 1843 :  708  
##  Max.   :371.0   Max.   :31.00   Aug    : 8024   2 Apr 1842 :  652  
##                                  (Other):35875   (Other)    :88161  
##  Departure_year   Arrival_year      Departure_port                Ship      
##  1842   : 6607   1844   : 7299   London    :31775   Joseph Somes (1):  895  
##  1844   : 6366   1842   : 7204   Portsmouth:10101   St Vincent      :  717  
##  1843   : 5546   1845   : 5459   Plymouth  : 9811   John Calvin     :  716  
##  1845   : 5109   1843   : 5180   Dublin    : 6736   Anson           :  708  
##  1852   : 4904   1841   : 4347   Downs     : 6227   Waverley (3)    :  590  
##  1841   : 4505   1850   : 4219   Woolwich  : 5880   Sarah           :  558  
##  (Other):59617   (Other):58946   (Other)   :22124   (Other)         :88470  
##    Surname           First_name       
##  Length:92654       Length:92654      
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
##                                       
## 
#summarise dataset by number of convicts on each ship

tidydf<-merge %>% 
  group_by(Voyage_no, Day, Month_name, Departure_date, Departure_year, Arrival_year, Departure_port, Ship) %>% 
  summarise(No_convicts=n()) %>% 
  arrange(desc(No_convicts))
## `summarise()` has grouped output by 'Voyage_no', 'Day', 'Month_name', 'Departure_date', 'Departure_year', 'Arrival_year', 'Departure_port'. You can override using the `.groups` argument.
head(tidydf)

Summary statistics

Provide summary statistics (mean, median, minimum, maximum, standard deviation) of total number of convicts (numerical) grouped by year of departure (categorical)

sumstats<-tidydf %>% 
  group_by(Departure_year) %>% 
  summarise(mean = mean(No_convicts), 
            median = median(No_convicts), 
            min = min(No_convicts), 
            max = max(No_convicts), 
            sd = sd(No_convicts))

#use forestmangr package to round all df values

round_df(sumstats)

Create a list

#create list of Month name as a numeric value

listNum<-list(as.numeric(tidydf$Month_name))

Join the list to original data frame

#convert the list to data frames 

listdf<-as.data.frame(listNum)
colnames(listdf)<-("Month")

#join both list data frames together into new data frame

tidydf<-cbind(tidydf, listdf)

head(tidydf)

Subsetting I

#Create subset of the first 10 observations 
subset1<-tidydf[1:10, ]
subset1
#Convert the subset to a matrix
subset_matrix<-data.matrix(subset1)
subset_matrix
##       Voyage_no Day Month_name Departure_date Departure_year Arrival_year
##  [1,]       227   1         10             11             43           44
##  [2,]       262  22          9            164             45           45
##  [3,]       338  24          1            181             51           51
##  [4,]       269   1          5              8             46           46
##  [5,]       341  27          4            208             51           51
##  [6,]       324  13          4             50             50           50
##  [7,]       187  26         10            206             41           42
##  [8,]       256   5          7            279             45           45
##  [9,]       256  14          1             71             46           46
## [10,]       239  17          5            106             44           44
##       Departure_port Ship No_convicts Month
##  [1,]             17   35         708    10
##  [2,]             17  640         510     9
##  [3,]             12   89         509     1
##  [4,]             19  629         507     5
##  [5,]             13   65         504     4
##  [6,]              5   73         486     4
##  [7,]             17  808         485    10
##  [8,]             13  275         476     7
##  [9,]             13  432         476     1
## [10,]              5   77         473     5
summary(subset_matrix)
##    Voyage_no          Day         Month_name   Departure_date   Departure_year 
##  Min.   :187.0   Min.   : 1.0   Min.   : 1.0   Min.   :  8.00   Min.   :41.00  
##  1st Qu.:243.2   1st Qu.: 7.0   1st Qu.: 4.0   1st Qu.: 55.25   1st Qu.:44.25  
##  Median :259.0   Median :15.5   Median : 5.0   Median :135.00   Median :45.50  
##  Mean   :269.9   Mean   :15.0   Mean   : 5.6   Mean   :128.40   Mean   :46.20  
##  3rd Qu.:310.2   3rd Qu.:23.5   3rd Qu.: 8.5   3rd Qu.:199.75   3rd Qu.:49.00  
##  Max.   :341.0   Max.   :27.0   Max.   :10.0   Max.   :279.00   Max.   :51.00  
##   Arrival_year   Departure_port       Ship        No_convicts        Month     
##  Min.   :42.00   Min.   : 5.00   Min.   : 35.0   Min.   :473.0   Min.   : 1.0  
##  1st Qu.:44.25   1st Qu.:12.25   1st Qu.: 74.0   1st Qu.:478.2   1st Qu.: 4.0  
##  Median :45.50   Median :13.00   Median :182.0   Median :495.0   Median : 5.0  
##  Mean   :46.40   Mean   :13.10   Mean   :312.3   Mean   :513.4   Mean   : 5.6  
##  3rd Qu.:49.00   3rd Qu.:17.00   3rd Qu.:579.8   3rd Qu.:508.5   3rd Qu.: 8.5  
##  Max.   :51.00   Max.   :19.00   Max.   :808.0   Max.   :708.0   Max.   :10.0

By converting the data frame to a matrix, the data frame has been converted to a numeric values. For qualitative values, the coercion seems to assign ordered numerical values.

Subsetting II

#subset the tidydf dataframe to include only the first and last variable
subset2<-tidydf[,c(1,10)]
subset2
#save as an RDS file
saveRDS(subset2, file="subset2.rds")

Create a new Data Frame

#Create data vector 1 - Ordinal

Day<- as.factor(c("Monday",
                  "Tuesday",
                  "Wednesday",
                  "Thursday",
                  "Friday",
                  "Saturday",
                  "Sunday"))

#Assign levels

Day<- factor(Day,
                levels=c("Monday",
                  "Tuesday",
                  "Wednesday",
                  "Thursday",
                  "Friday",
                  "Saturday",
                  "Sunday"))

#Show structure

str(Day)
##  Factor w/ 7 levels "Monday","Tuesday",..: 1 2 3 4 5 6 7
#Show levels

Day
## [1] Monday    Tuesday   Wednesday Thursday  Friday    Saturday  Sunday   
## Levels: Monday Tuesday Wednesday Thursday Friday Saturday Sunday
#Create data vector 2 - Numerical (Integer)

No_customers<-as.integer(c(24, 28, 23, 12, 17, 48, 42))

#Show structure

str(No_customers)
##  int [1:7] 24 28 23 12 17 48 42
#Create a dataframe with both vectors

df<-data.frame(Day, No_customers)
df
#Create another numeric vector

Sales<-as.double(c(245.25, 221.84, 184.12, 150.15, 174.24, 486.54, 423.84))
Sales
## [1] 245.25 221.84 184.12 150.15 174.24 486.54 423.84
#Show structure of new vector

str(Sales)
##  num [1:7] 245 222 184 150 174 ...
# Use cbind() to add new vector to data frame

df<-cbind(df, Sales)
df
# Check dimensions of updated data frame

dim(df)
## [1] 7 3
# Check attributes of updated data frame

str(df)
## 'data.frame':    7 obs. of  3 variables:
##  $ Day         : Factor w/ 7 levels "Monday","Tuesday",..: 1 2 3 4 5 6 7
##  $ No_customers: int  24 28 23 12 17 48 42
##  $ Sales       : num  245 222 184 150 174 ...

Create another Data Frame

#Create a common variable with previous dataset

Day<- as.factor(c("Monday",
                  "Tuesday",
                  "Wednesday",
                  "Thursday",
                  "Friday",
                  "Saturday",
                  "Sunday"))

#Assign levels

Day<- factor(Day,
                levels=c("Monday",
                  "Tuesday",
                  "Wednesday",
                  "Thursday",
                  "Friday",
                  "Saturday",
                  "Sunday"))
#Create a second variable for new dataset

Employee<- as.factor(c("Steven",
                       "Steven",
                        "Wendy", 
                        "Steven", 
                        "Jane", 
                        "Peter", 
                        "Peter"))
# create new data frame with the two new variables

newdf<-data.frame(Day, Employee)
newdf
# Join the new data frame to the previous data frame

join_df<-df %>%
  full_join(newdf, by="Day") 

join_df