library(readr)
library(stringr)
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(forcats)
library(forestmangr)
options(warn=-1)
#assign address to url variable
url <- "http://data.gov.au/data/dataset/6d7bacba-8e0f-4105-b7bc-9fe11eda9f89/resource/8d3e12fa-a9fd-4268-9164-6240a97ff44d/download/convicts.csv"
#Read the data with only relevant columns specified
convicts <- read_csv(url) [,c(6,8:12)]
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_character(),
## CON_IDX = col_double(),
## POLICE_NO = col_double(),
## EXTRA_ID = col_double(),
## CHILD_FATHER = col_logical()
## )
## i Use `spec()` for the full column specifications.
#Review parsing errors
problems(convicts)
#view first few rows of imported dataset
head(convicts)
The address for the online csv file was assigned to a variable named ‘url’ to make it easier to read in.
The readr function ‘read_csv’ was used to read in the data as it is purported to be 10 times faster than equivalent base functions. The data was saved as a data frame called ‘convicts’.
Rather than importing the whole dataset which contained a lot of superfluous information, only relevant variables were selected to read in.
There was a parsing error when loading in the data. By using the ‘problems’ function I was able to see that the error was due to a difference in the expected v actual number of columns. This has no impact on my planned analysis so can be disregarded.
After importing the data, the ‘head’ function was used to view the first few rows of data.
The Tasmanian Convicts (1803-1893) dataset was sourced from data.gov.au at the following url: http://data.gov.au/data/dataset/6d7bacba-8e0f-4105-b7bc-9fe11eda9f89/resource/8d3e12fa-a9fd-4268-9164-6240a97ff44d/download/convicts.csv
It contains over 80,000 records of convicts who were either transported to Tasmania or convicted locally between 1803-1893.
Variable Descriptions:
#number of rows
nrow(convicts)
## [1] 81507
#number of columns
ncol(convicts)
## [1] 6
#alternately, see both rows and columns with dim() function
dim(convicts)
## [1] 81507 6
#Check the data types - currently all are Character type
sapply(convicts, class)
## VOYAGE_NO DEPARTURE_DATE SHIP NAME YEAR
## "character" "character" "character" "character" "character"
## DEPARTURE_PORT
## "character"
#Change column names from uppercase to proper
colnames(convicts)<-str_to_title(colnames(convicts))
#Reassign data types as appropriate
#cannot reassign Year yet due to multiple values - to be addressed later
convicts$Voyage_no<-as.integer(convicts$Voyage_no)
convicts$Departure_date<-as.factor(convicts$Departure_date)
convicts$Ship<-as.factor(convicts$Ship)
convicts$Name<-as.factor(convicts$Name)
convicts$Departure_port<-as.factor(convicts$Departure_port)
#Review class amendments
sapply(convicts, class)
## Voyage_no Departure_date Ship Name Year
## "integer" "factor" "factor" "factor" "character"
## Departure_port
## "factor"
head(convicts)
Reshape the dataset into a tidy format.
#Split out Departure_date column into Day, Month_name and Year (retain original column)
convicts$datesplit<-convicts$Departure_date
convicts<-convicts%>%
separate(datesplit, c("Day", "Month_name", "Yr"), sep=" ")
#Reassign new variables to correct data types
convicts$Day<-as.integer(convicts$Day)
convicts$Yr<-as.integer(convicts$Yr)
# Order levels for Month Variable
convicts$Month_name<-factor(convicts$Month_name,
levels=c('Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'),
ordered=TRUE)
#Year column contains multiple values in some cells. Separate into separate columns.
convicts<-convicts %>%
separate(Year, into = c("Arrival_year", "Departure_year"))
#Replace missing values in Arrival Year with values in Departure Year. These are voyages that departed and arrived in the same year.
convicts$Departure_year<-ifelse(is.na(convicts$Departure_year), (convicts$Arrival_year), convicts$Departure_year)
#Reassign class to factor
convicts$Arrival_year<-as.factor(convicts$Arrival_year)
convicts$Departure_year<-as.factor(convicts$Departure_year)
head(convicts)
#Name column contains multiple values in some cells, separated by "|". Need to separate into separate columns.
#First, create a new variable to count how many | are in each cell, then n+1 = number of convicts per entry
convicts$n<-stringi::stri_count(convicts$Name, fixed = "|")+1
#Filter out any rows with no convicts
convicts<-convicts %>% filter(!is.na(n))
#Can now work out the maximum number of names in any cell
max(convicts$n)
## [1] 12
#Separate names into separate columns - 12 columns needed as determined in previous step
convicts<-convicts%>%
separate(Name, c("Name", "Name2", "Name3", "Name4", "Name5","Name6","Name7","Name8","Name9","Name10","Name11","Name12"), sep="[|]")
#create new data frames for each of the Name columns, filtered to only show rows with values
convicts1<-convicts %>% select(-(Name2:Name12)) %>% filter(!is.na(Name))
convicts2<-convicts %>% select(-(Name),-(Name3:Name12)) %>% filter(!is.na(Name2))
convicts3<-convicts %>% select(-(Name:Name2),-(Name4:Name12)) %>% filter(!is.na(Name3))
convicts4<-convicts %>% select(-(Name:Name3),-(Name5:Name12)) %>% filter(!is.na(Name4))
convicts5<-convicts %>% select(-(Name:Name4),-(Name6:Name12)) %>% filter(!is.na(Name5))
convicts6<-convicts %>% select(-(Name:Name5),-(Name7:Name12)) %>% filter(!is.na(Name6))
convicts7<-convicts %>% select(-(Name:Name6),-(Name8:Name12)) %>% filter(!is.na(Name7))
convicts8<-convicts %>% select(-(Name:Name7),-(Name9:Name12)) %>% filter(!is.na(Name8))
convicts9<-convicts %>% select(-(Name:Name8),-(Name10:Name12)) %>% filter(!is.na(Name9))
convicts10<-convicts %>% select(-(Name:Name9),-(Name11:Name12)) %>% filter(!is.na(Name10))
convicts11<-convicts %>% select(-(Name:Name10),-(Name12)) %>% filter(!is.na(Name11))
convicts12<-convicts %>% select(-(Name:Name11)) %>% filter(!is.na(Name12))
#Rename Name columns so all data frames have the same column names
names(convicts2)[names(convicts2)=='Name2']<-'Name'
names(convicts3)[names(convicts3)=='Name3']<-'Name'
names(convicts4)[names(convicts4)=='Name4']<-'Name'
names(convicts5)[names(convicts5)=='Name5']<-'Name'
names(convicts6)[names(convicts6)=='Name6']<-'Name'
names(convicts7)[names(convicts7)=='Name7']<-'Name'
names(convicts8)[names(convicts8)=='Name8']<-'Name'
names(convicts9)[names(convicts9)=='Name9']<-'Name'
names(convicts10)[names(convicts10)=='Name10']<-'Name'
names(convicts11)[names(convicts11)=='Name11']<-'Name'
names(convicts12)[names(convicts12)=='Name12']<-'Name'
#bind rows to merge all data frames back into one
convictsbind<-bind_rows(convicts1, convicts2, convicts3, convicts4, convicts5, convicts6, convicts7, convicts8, convicts9, convicts10, convicts11, convicts12)
#update the original data frame with the changes
convicts<-convictsbind
#Now we can separate the Name column into Surname and First_name
convicts<-convicts%>%
separate(Name, c("Surname", "First_name"), sep=",")
#filter out duplicate entries & other faulty data
convicts<-convicts[!duplicated(convicts),]
convicts<-convicts %>% filter(Ship!="1804|1803")
#reorder df to suit preference
convicts<-convicts %>% select (Voyage_no, Day, Month_name, Departure_date, Departure_year, Arrival_year, Departure_port, Ship, Surname, First_name)
head(convicts)
#Address Missing Values for observations with Voyage no
#Create a dataframe that includes everything except convict names
completeDF<-convicts %>% select(everything(), -Surname, -First_name) %>% arrange(Voyage_no)
#Reduce down to complete cases
completeDF<-completeDF[complete.cases(completeDF), ]
#Remove Duplicates
completeDF<-completeDF[!duplicated(completeDF),]
#Merge data where Voyage_no and Departure_date is not na
merge<-convicts %>%
filter(!is.na(Voyage_no))%>%
left_join(completeDF, by="Voyage_no") %>%
select(Voyage_no, Day.y, Month_name.y, Departure_date.y, Departure_year.y, Arrival_year.y, Departure_port.y, Ship.y, Surname, First_name)
merge<-merge %>% filter(!is.na(Ship.y))
#Rename columns
names(merge)[names(merge)=='Day.y']<-'Day'
names(merge)[names(merge)=='Month_name.y']<-'Month_name'
names(merge)[names(merge)=='Departure_date.y']<-'Departure_date'
names(merge)[names(merge)=='Departure_year.y']<-'Departure_year'
names(merge)[names(merge)=='Arrival_year.y']<-'Arrival_year'
names(merge)[names(merge)=='Departure_port.y']<-'Departure_port'
names(merge)[names(merge)=='Ship.y']<-'Ship'
#Check all NA values addressed
summary(merge)
## Voyage_no Day Month_name Departure_date
## Min. : 1.0 Min. : 1.00 Apr :14183 14 Jan 1846: 895
## 1st Qu.:118.0 1st Qu.: 8.00 Sep : 9354 30 Apr 1848: 792
## Median :199.0 Median :15.00 May : 8535 24 Nov 1852: 730
## Mean :198.7 Mean :15.66 Jul : 8477 16 Apr 1843: 716
## 3rd Qu.:288.0 3rd Qu.:24.00 Jun : 8206 1 Oct 1843 : 708
## Max. :371.0 Max. :31.00 Aug : 8024 2 Apr 1842 : 652
## (Other):35875 (Other) :88161
## Departure_year Arrival_year Departure_port Ship
## 1842 : 6607 1844 : 7299 London :31775 Joseph Somes (1): 895
## 1844 : 6366 1842 : 7204 Portsmouth:10101 St Vincent : 717
## 1843 : 5546 1845 : 5459 Plymouth : 9811 John Calvin : 716
## 1845 : 5109 1843 : 5180 Dublin : 6736 Anson : 708
## 1852 : 4904 1841 : 4347 Downs : 6227 Waverley (3) : 590
## 1841 : 4505 1850 : 4219 Woolwich : 5880 Sarah : 558
## (Other):59617 (Other):58946 (Other) :22124 (Other) :88470
## Surname First_name
## Length:92654 Length:92654
## Class :character Class :character
## Mode :character Mode :character
##
##
##
##
#summarise dataset by number of convicts on each ship
tidydf<-merge %>%
group_by(Voyage_no, Day, Month_name, Departure_date, Departure_year, Arrival_year, Departure_port, Ship) %>%
summarise(No_convicts=n()) %>%
arrange(desc(No_convicts))
## `summarise()` has grouped output by 'Voyage_no', 'Day', 'Month_name', 'Departure_date', 'Departure_year', 'Arrival_year', 'Departure_port'. You can override using the `.groups` argument.
head(tidydf)
Provide summary statistics (mean, median, minimum, maximum, standard deviation) of total number of convicts (numerical) grouped by year of departure (categorical)
sumstats<-tidydf %>%
group_by(Departure_year) %>%
summarise(mean = mean(No_convicts),
median = median(No_convicts),
min = min(No_convicts),
max = max(No_convicts),
sd = sd(No_convicts))
#use forestmangr package to round all df values
round_df(sumstats)
#create list of Month name as a numeric value
listNum<-list(as.numeric(tidydf$Month_name))
#convert the list to data frames
listdf<-as.data.frame(listNum)
colnames(listdf)<-("Month")
#join both list data frames together into new data frame
tidydf<-cbind(tidydf, listdf)
head(tidydf)
#Create subset of the first 10 observations
subset1<-tidydf[1:10, ]
subset1
#Convert the subset to a matrix
subset_matrix<-data.matrix(subset1)
subset_matrix
## Voyage_no Day Month_name Departure_date Departure_year Arrival_year
## [1,] 227 1 10 11 43 44
## [2,] 262 22 9 164 45 45
## [3,] 338 24 1 181 51 51
## [4,] 269 1 5 8 46 46
## [5,] 341 27 4 208 51 51
## [6,] 324 13 4 50 50 50
## [7,] 187 26 10 206 41 42
## [8,] 256 5 7 279 45 45
## [9,] 256 14 1 71 46 46
## [10,] 239 17 5 106 44 44
## Departure_port Ship No_convicts Month
## [1,] 17 35 708 10
## [2,] 17 640 510 9
## [3,] 12 89 509 1
## [4,] 19 629 507 5
## [5,] 13 65 504 4
## [6,] 5 73 486 4
## [7,] 17 808 485 10
## [8,] 13 275 476 7
## [9,] 13 432 476 1
## [10,] 5 77 473 5
summary(subset_matrix)
## Voyage_no Day Month_name Departure_date Departure_year
## Min. :187.0 Min. : 1.0 Min. : 1.0 Min. : 8.00 Min. :41.00
## 1st Qu.:243.2 1st Qu.: 7.0 1st Qu.: 4.0 1st Qu.: 55.25 1st Qu.:44.25
## Median :259.0 Median :15.5 Median : 5.0 Median :135.00 Median :45.50
## Mean :269.9 Mean :15.0 Mean : 5.6 Mean :128.40 Mean :46.20
## 3rd Qu.:310.2 3rd Qu.:23.5 3rd Qu.: 8.5 3rd Qu.:199.75 3rd Qu.:49.00
## Max. :341.0 Max. :27.0 Max. :10.0 Max. :279.00 Max. :51.00
## Arrival_year Departure_port Ship No_convicts Month
## Min. :42.00 Min. : 5.00 Min. : 35.0 Min. :473.0 Min. : 1.0
## 1st Qu.:44.25 1st Qu.:12.25 1st Qu.: 74.0 1st Qu.:478.2 1st Qu.: 4.0
## Median :45.50 Median :13.00 Median :182.0 Median :495.0 Median : 5.0
## Mean :46.40 Mean :13.10 Mean :312.3 Mean :513.4 Mean : 5.6
## 3rd Qu.:49.00 3rd Qu.:17.00 3rd Qu.:579.8 3rd Qu.:508.5 3rd Qu.: 8.5
## Max. :51.00 Max. :19.00 Max. :808.0 Max. :708.0 Max. :10.0
By converting the data frame to a matrix, the data frame has been converted to a numeric values. For qualitative values, the coercion seems to assign ordered numerical values.
#subset the tidydf dataframe to include only the first and last variable
subset2<-tidydf[,c(1,10)]
subset2
#save as an RDS file
saveRDS(subset2, file="subset2.rds")
#Create data vector 1 - Ordinal
Day<- as.factor(c("Monday",
"Tuesday",
"Wednesday",
"Thursday",
"Friday",
"Saturday",
"Sunday"))
#Assign levels
Day<- factor(Day,
levels=c("Monday",
"Tuesday",
"Wednesday",
"Thursday",
"Friday",
"Saturday",
"Sunday"))
#Show structure
str(Day)
## Factor w/ 7 levels "Monday","Tuesday",..: 1 2 3 4 5 6 7
#Show levels
Day
## [1] Monday Tuesday Wednesday Thursday Friday Saturday Sunday
## Levels: Monday Tuesday Wednesday Thursday Friday Saturday Sunday
#Create data vector 2 - Numerical (Integer)
No_customers<-as.integer(c(24, 28, 23, 12, 17, 48, 42))
#Show structure
str(No_customers)
## int [1:7] 24 28 23 12 17 48 42
#Create a dataframe with both vectors
df<-data.frame(Day, No_customers)
df
#Create another numeric vector
Sales<-as.double(c(245.25, 221.84, 184.12, 150.15, 174.24, 486.54, 423.84))
Sales
## [1] 245.25 221.84 184.12 150.15 174.24 486.54 423.84
#Show structure of new vector
str(Sales)
## num [1:7] 245 222 184 150 174 ...
# Use cbind() to add new vector to data frame
df<-cbind(df, Sales)
df
# Check dimensions of updated data frame
dim(df)
## [1] 7 3
# Check attributes of updated data frame
str(df)
## 'data.frame': 7 obs. of 3 variables:
## $ Day : Factor w/ 7 levels "Monday","Tuesday",..: 1 2 3 4 5 6 7
## $ No_customers: int 24 28 23 12 17 48 42
## $ Sales : num 245 222 184 150 174 ...
#Create a common variable with previous dataset
Day<- as.factor(c("Monday",
"Tuesday",
"Wednesday",
"Thursday",
"Friday",
"Saturday",
"Sunday"))
#Assign levels
Day<- factor(Day,
levels=c("Monday",
"Tuesday",
"Wednesday",
"Thursday",
"Friday",
"Saturday",
"Sunday"))
#Create a second variable for new dataset
Employee<- as.factor(c("Steven",
"Steven",
"Wendy",
"Steven",
"Jane",
"Peter",
"Peter"))
# create new data frame with the two new variables
newdf<-data.frame(Day, Employee)
newdf
# Join the new data frame to the previous data frame
join_df<-df %>%
full_join(newdf, by="Day")
join_df