Setup

# Load packages 
library(readr)
library(knitr)
library(tidyr) 
library(dplyr) 
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(magrittr) 
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:tidyr':
## 
##     extract

Read/Import data

# Read/Import Data
australia_cities <- data.frame(read_csv("au.csv"))
## Rows: 1035 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): city, country, iso2, admin_name, capital
## dbl (4): lat, lng, population, population_proper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(australia_cities)

Explanation of actions taken:

Data description

The data set is about Australian cities and it provides information such as latitude, longitude, city type and population on 1,035 locations (Marya Alizadeh 2020, para. 1). The source of data is https://www.kaggle.com/maryamalizadeh/worldcities-australia. The webpage states that “this data had been subsetted from the main data set which was available on https://simplemaps.com/data/world-cities” (Marya Alizadeh 2020, para. 2).

The australia_cities data set includes nine variables:

Inspect dataset and variables

# Check dimensions of the data frame
dim(australia_cities) 
## [1] 1035    9
# Check data types of the variables individually
class(australia_cities$city) 
## [1] "character"
class(australia_cities$lat)
## [1] "numeric"
class(australia_cities$lng)
## [1] "numeric"
class(australia_cities$country)
## [1] "character"
class(australia_cities$iso2)
## [1] "character"
class(australia_cities$admin_name)
## [1] "character"
class(australia_cities$capital)
## [1] "character"
class(australia_cities$population)
## [1] "numeric"
class(australia_cities$population_proper)
## [1] "numeric"
# Convert variable australia_cities$admin_name to factor variable and define levels
australia_cities$admin_name <- factor(australia_cities$admin_name, levels = c("New South Wales", "Victoria", "Queensland",
                                                                              "Western Australia","South Australia", "Australian Capital Territory", "Tasmania", "Northern Territory"))
class(australia_cities$admin_name)
## [1] "factor"
levels(australia_cities$admin_name)
## [1] "New South Wales"              "Victoria"                    
## [3] "Queensland"                   "Western Australia"           
## [5] "South Australia"              "Australian Capital Territory"
## [7] "Tasmania"                     "Northern Territory"
# Convert variable australia_cities$capital to factor and define levels
australia_cities$capital <- factor(australia_cities$capital, levels = c("primary", "admin"), exclude = NA)
class(australia_cities$capital)
## [1] "factor"
levels(australia_cities$capital)
## [1] "primary" "admin"
# Check the column names in the data frame
colnames(australia_cities)
## [1] "city"              "lat"               "lng"              
## [4] "country"           "iso2"              "admin_name"       
## [7] "capital"           "population"        "population_proper"
# Define another data frame
australia_cities_1 <- australia_cities

# Rename variables/columns to be more informative
colnames(australia_cities_1) <- c("city", "latitude", "longitude", "country", "country_code", "state", "city_type", "urban_population", "municipal_population")
colnames(australia_cities_1)
## [1] "city"                 "latitude"             "longitude"           
## [4] "country"              "country_code"         "state"               
## [7] "city_type"            "urban_population"     "municipal_population"
head(australia_cities_1)

Explanation of actions taken:

Tidy data

# check variable names
colnames(australia_cities_1)
## [1] "city"                 "latitude"             "longitude"           
## [4] "country"              "country_code"         "state"               
## [7] "city_type"            "urban_population"     "municipal_population"
# Check and tidy observations
n_distinct(australia_cities_1$city)
## [1] 1026
australia_cities_1 %>% 
  filter(duplicated(australia_cities_1$city) == TRUE)
australia_cities_2 <- australia_cities_1[!duplicated(australia_cities_1$city), ]
australia_cities_2
# Remove unnecessary variables
australia_cities_3 <- australia_cities_2 %>% 
  select(-(country:country_code))
australia_cities_3
# Replace values of factor variable with more informative terms
australia_cities_4 <- australia_cities_3
levels(australia_cities_4$city_type)[1] <- "nation-capital"
levels(australia_cities_4$city_type)[2] <- "state-capital"
levels(australia_cities_4$city_type)[3] <- "non-capital"
australia_cities_4 <- australia_cities_4 %>% 
  mutate(city_type = replace_na(australia_cities_4$city_type, "non-capital"))
australia_cities_4

Wickham and Grolemund (2017, p. 149) argue that “there are three principles to tidy data:

They (2017, p. 152) also mention that “there are two common problems with untidy data:

Explanation of actions taken:

Now, the data frame complies with tidy data principles (Wickham & Grolemund 2017). However, there is still more to be done to turn it into a clean and meaningful data frame:

Summary statistics

# Grouped summary statistics on urban_population
australia_cities_4 %>% 
  group_by(state) %>% 
  summarise(urban_pop_mean = mean(urban_population),
            urban_pop_median = median(urban_population),
            urban_pop_min = min(urban_population),
            urban_pop_max = max(urban_population),
            urban_pop_Std_dev = sd(urban_population))
# Grouped summary statistics on municipal_population
australia_cities_4 %>% 
  group_by(state) %>% 
  summarise(municipal_pop_mean = mean(municipal_population),
            municipal_pop_median = median(municipal_population),
            municipal_pop_min = min(municipal_population),
            municipal_pop_max = max(municipal_population),
            municipal_pop_Std_dev = sd(municipal_population))

Explanation of actions taken:

Create a list

# Create a factor vector corresponding to state variable
state <- factor(c("New South Wales", "Victoria", "Queensland", "Western Australia",
                              "South Australia", "Tasmania", "Australian Capital Territory", "Northern Territory"),
                levels = c("New South Wales", "Victoria", "Queensland", "Western Australia",
                           "South Australia", "Tasmania", "Australian Capital Territory", "Northern Territory"))
state
## [1] New South Wales              Victoria                    
## [3] Queensland                   Western Australia           
## [5] South Australia              Tasmania                    
## [7] Australian Capital Territory Northern Territory          
## 8 Levels: New South Wales Victoria Queensland ... Northern Territory
# Create a numeric vector for economic rank of the state in Australia
economic_rank_of_state <- c(1, 2, 3, 4, 5, 6, 7, 8)
economic_rank_of_state
## [1] 1 2 3 4 5 6 7 8
# Combine vectors to create a list
state_economic_performance <- list(state, economic_rank_of_state)
state_economic_performance
## [[1]]
## [1] New South Wales              Victoria                    
## [3] Queensland                   Western Australia           
## [5] South Australia              Tasmania                    
## [7] Australian Capital Territory Northern Territory          
## 8 Levels: New South Wales Victoria Queensland ... Northern Territory
## 
## [[2]]
## [1] 1 2 3 4 5 6 7 8

Explanation of actions taken:

Join the list

# Convert the newly created list to a data frame
state_economic_performance_df <- as.data.frame(state_economic_performance)
class(state_economic_performance_df)
## [1] "data.frame"
colnames(state_economic_performance_df)
## [1] "structure.1.8..levels...c..New.South.Wales....Victoria....Queensland..."
## [2] "c.1..2..3..4..5..6..7..8."
colnames(state_economic_performance_df) <- c("state", "economic_rank_of_state")
colnames(state_economic_performance_df)
## [1] "state"                  "economic_rank_of_state"
state_economic_performance_df
# Join the newly created data frame onto the original data frame using left_join() function
australia_cities_5 <- australia_cities_4 %>% 
  left_join(state_economic_performance_df, by = "state")
head(australia_cities_5, 10)
tail(australia_cities_5, 10)

Explanation of actions taken:

Subsetting I

# Subset the first ten observations of the data frame
subset_1 <- australia_cities_5[1:10, ]
subset_1
# Convert to a matrix and check structure
matrix <- as.matrix(subset_1)
class(matrix)
## [1] "matrix" "array"
typeof(matrix)
## [1] "character"
str(matrix)
##  chr [1:10, 1:8] "Sydney" "Melbourne" "Brisbane" "Perth" "Adelaide" ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:10] "1" "2" "3" "4" ...
##   ..$ : chr [1:8] "city" "latitude" "longitude" "state" ...

Explanation of actions taken:

Subsetting II

# Subset the first and the last variables of the data frame
subset_2 <- australia_cities_5 %>% 
  select(city, economic_rank_of_state)
head(subset_2)
# Save the subset as a .RData file
save(subset_2, file = "subset_2.RData")

Explanation of actions taken:

Create a new Data Frame

# Create an ordinal variable
data_wrangling_final <- factor( c("NN","PA", "CR", "DI", "HD"), 
                      levels = c("NN", "PA", "CR", "DI", "HD"),     
                      ordered=TRUE) 
str(data_wrangling_final)
##  Ord.factor w/ 5 levels "NN"<"PA"<"CR"<..: 1 2 3 4 5
levels(data_wrangling_final)
## [1] "NN" "PA" "CR" "DI" "HD"
# Create an integer variable
number_of_students <- c(5L, 8L, 10L, 5L, 2L)
str(number_of_students)
##  int [1:5] 5 8 10 5 2
# create a data frame from the ordinal and the integer variables
data_wrangling_report <- data.frame(data_wrangling_final, number_of_students)
data_wrangling_report
# Create a numeric vector and bind it onto the data frame
mean_study_hours_weekly <- c(3.1, 10.4, 15.7, 21.5, 33.3)
class(mean_study_hours_weekly)
## [1] "numeric"
data_wrangling_report_2 <- cbind(data_wrangling_report, mean_study_hours_weekly)
data_wrangling_report_2
dim(data_wrangling_report_2)
## [1] 5 3
str(data_wrangling_report_2)
## 'data.frame':    5 obs. of  3 variables:
##  $ data_wrangling_final   : Ord.factor w/ 5 levels "NN"<"PA"<"CR"<..: 1 2 3 4 5
##  $ number_of_students     : int  5 8 10 5 2
##  $ mean_study_hours_weekly: num  3.1 10.4 15.7 21.5 33.3

Explanation of actions taken:

Create another Data Frame

# Create another data frame with common categorical variable to data_wrangling_report_2
mean_years_experience_data_field <- c(0.7, 3.3, 3.7, 5.8, 4.5)
data_frame_experience <- data.frame(data_wrangling_final, mean_years_experience_data_field)
str(data_frame_experience)
## 'data.frame':    5 obs. of  2 variables:
##  $ data_wrangling_final            : Ord.factor w/ 5 levels "NN"<"PA"<"CR"<..: 1 2 3 4 5
##  $ mean_years_experience_data_field: num  0.7 3.3 3.7 5.8 4.5
data_frame_experience
# join data_frame_experience onto data_wrangling_report_2
data_wrangling_report_complete <- data_wrangling_report_2 %>% 
  inner_join(data_frame_experience, by = "data_wrangling_final")
data_wrangling_report_complete

Explanation of actions taken:

Reference List