Wide, or unstacked data is presented with each different data variable in a separate column. “https://en.wikipedia.org/wiki/Wide_and_narrow_data”
Untidy data is messy or poorly formatted which makes is harder to do analysis.
Each variable in the dataset is placed in its own column
Each observation is placed in its own row
Each value is placed in its own cell
source: http://garrettgman.github.io/tidying/
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.3.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.3.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(zoo)
## Warning: package 'zoo' was built under R version 3.3.3
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(stringr)
library(rio)
## Warning: package 'rio' was built under R version 3.3.3
library(RCurl)
## Loading required package: bitops
##
## Attaching package: 'RCurl'
## The following object is masked from 'package:tidyr':
##
## complete
library(bitops)
The file can be downloaded via this link:
https://bbhosted.cuny.edu/courses/1/SPS01_DATA_607_01_1172_1/db/_22605542_1/Discussion5.csv
#This code reads the Discussion5.csv file into data1 object.
data1<- read.csv("https://raw.githubusercontent.com/excelsiordata/DATA607/master/Discussion5.csv", na.strings=c("","NA"), stringsAsFactors = FALSE)
# An additional argument is added to label all blank rows as "NA" in order to use the na.locf function
##1
# First we need to repeat the date for each Carrier item. Thats is for each Carrier repeating the date for the shipping fee.
# By using the mutate function we can update all rows missing a date value that we earlier labeled "NA" with the previous non missing value. We do this by using the na.locf function.
data1.1<- mutate(data1, Date = na.locf(Date))
##2
#Second need to make the dataset into a long instead of wide format. This is done using the gather function to convert columns into rows.
##3
#Third we need to ensure that each column only contains 1 variable. Notice that the values column has both price and shipping fees so we need to change that to make the dataset tidy. We can split these into 2 different columns by using the spread function.
# Lastly we can order the dataset by country in alphabetical order using the arrange function.
data1.2<-
data1.1 %>%
gather("Country", "Amount", 3:10) %>%
spread(Values, Amount) %>%
arrange(Country)
#Analysis 1 - Percentage of tax charged
#Question: are we collecting enough fees vs what the company is charging
#*********************************
#analysis1 <- data1.2 %>%
#mutate(Percentage = `Price of Carrier`/`Shipping Fees Collected`)
#Analysis 2 - Average shipping cost per month
#analysis<- data1.2 %>%
#Analysis 3 - Average shipping cost per month by country
Table was copied from https://www.bls.gov/news.release/empsit.t19.htm and pasted into an excel sheet. It was then saved as a comma separated file “data2”
# Create an object data2 and store the information from the csv file in it
# Remove table title information by starting reading the from line 3 of the csv file. Set parameter skip = 2.
# Set all blank values to NA using na.strings argument
data2<- read.csv("https://raw.githubusercontent.com/excelsiordata/DATA607/master/data2.csv", skip = 2, stringsAsFactors = FALSE)
# Remove footnotes and note information which is not relevant to the table for analysis.
#Can do this by using the slice function to subset only those rows we want.
data2<- slice(data2, 1:21)
#First create a new table for Average hourly earnings by selecting the columns for that using the select function then save it as its own table
data2.1<- select(data2, Industry, Average.hourly.earnings, X, X.1, X.2)
# Do the same process to create a new table for Average weekly earnings
data2.2<- select(data2, -Average.hourly.earnings, -X, -X.1, -X.2)
#rename the column headings combining the 2 separate values of month and year
colnames(data2.1) <- c("Industry", "Feb 2016", "Dec 2016", "Jan 2017(p)", "Feb 2017(p)")
data2hrly<-
data2.1 %>%
slice(3:21) %>% #removes the 2 rows contained the column heading info
gather("Date", "Average hourly earnings in dollars", 2:5) #convert columns into rows for tidyness
#rename the column headings combining the 2 separate values of month and year
colnames(data2.2) <- c("Industry", "Feb 2016", "Dec 2016", "Jan 2017(p)", "Feb 2017(p)")
data2wkly<-
data2.2 %>%
slice(3:21) %>% #removes the 2 rows contained the column heading info
gather("Date", "Average weekly earnings in dollars", 2:5) #convert columns into rows for tidyness
#Merge data2hrly with data2wkly using inner join and with a composite key where the industry type and date make up the composite key
data2final<- inner_join(data2hrly,data2wkly, by = c("Industry" = "Industry", "Date" ="Date"))
#this doesn't seem to do anything, not working like it should
data2final$`Average hourly earnings in dollars` <- gsub("$", "", data2final$`Average hourly earnings in dollars`)
data2final$`Average weekly earnings in dollars` <- gsub("$", "", data2final$`Average weekly earnings in dollars`)
#this deletes random values too if run
data2final$`Average weekly earnings in dollars` <- str_replace_all((data2final$`Average weekly earnings in dollars`), fixed("$"), "")
data2final$`Average hourly earnings in dollars` <- str_replace_all((data2final$`Average hourly earnings in dollars`), fixed("$"), "")
#this deletes random values if run
data2final$`Average hourly earnings in dollars` <- as.numeric(data2final$`Average hourly earnings in dollars`)
data2final$`Average weekly earnings in dollars` <- as.numeric(data2final$`Average weekly earnings in dollars`)
## Warning: NAs introduced by coercion
head(data2final)
## Industry Date Average hourly earnings in dollars
## 1 Total private Feb 2016 25.38
## 2 Goods-producing Feb 2016 26.54
## 3 Mining and logging Feb 2016 31.64
## 4 Construction Feb 2016 27.74
## 5 Manufacturing Feb 2016 25.62
## 6 Durable goods Feb 2016 26.96
## Average weekly earnings in dollars
## 1 875.61
## 2 NA
## 3 NA
## 4 NA
## 5 NA
## 6 NA
The third data set we’ll look at is a subset of a “Women in development” data set that I found on in the World Development Indicators site that a classmate Kyle posted. I decided to exclusively study life expectancy at birth by country and gender. It’s untidy, so we’ll clean it up.
#Load in all the data
x <- getURL("https://raw.githubusercontent.com/excelsiordata/DATA607/master/Women_in_development.csv")
WID <- read.csv(text = x, head=TRUE, sep=",", stringsAsFactors=FALSE, skip = 3, col.names = c("Country", "Male.Life.Expectancy.in.Years", "Female.Life.Expectancy.in.Years", "% Women Married by 18", "Male Fin Acct", "Fem Fin Acct", "% Male Salaried Employment", "% Fem Salaried Employment", "Fem PT Employment", "Firms w Fem Ownership", "Fem leg, sen off, and mangrs", "Women in parliaments", "Nondisc clause mentions gender in const"))
#Create the data frame
LEbyCountry <- data.frame(WID$Country, WID$Male.Life.Expectancy.in.Years, WID$Female.Life.Expectancy.in.Years)
#Take a look at the data and make sure everything loaded in properly
head(LEbyCountry)
## WID.Country WID.Male.Life.Expectancy.in.Years
## 1 Afghanistan 59.2
## 2 Albania 75.4
## 3 Algeria 72.5
## 4 American Samoa ..
## 5 Andorra ..
## 6 Angola 50.8
## WID.Female.Life.Expectancy.in.Years
## 1 61.6
## 2 80.4
## 3 77.2
## 4 ..
## 5 ..
## 6 53.8
#Put gender into its own column, and life expectancy into its own column
LEbyCandG <- data.frame(gather(LEbyCountry, "Gender", "Life.Expectancy", 2:3))
## Warning: attributes are not identical across measure variables; they will
## be dropped
#Take a look and make sure that worked properly
head(LEbyCandG)
## WID.Country Gender Life.Expectancy
## 1 Afghanistan WID.Male.Life.Expectancy.in.Years 59.2
## 2 Albania WID.Male.Life.Expectancy.in.Years 75.4
## 3 Algeria WID.Male.Life.Expectancy.in.Years 72.5
## 4 American Samoa WID.Male.Life.Expectancy.in.Years ..
## 5 Andorra WID.Male.Life.Expectancy.in.Years ..
## 6 Angola WID.Male.Life.Expectancy.in.Years 50.8
tail(LEbyCandG)
## WID.Country Gender
## 439 Europe & Central Asia WID.Female.Life.Expectancy.in.Years
## 440 Latin America & Caribbean WID.Female.Life.Expectancy.in.Years
## 441 Middle East & North Africa WID.Female.Life.Expectancy.in.Years
## 442 North America WID.Female.Life.Expectancy.in.Years
## 443 South Asia WID.Female.Life.Expectancy.in.Years
## 444 Sub-Saharan Africa WID.Female.Life.Expectancy.in.Years
## Life.Expectancy
## 439 80.4
## 440 78.3
## 441 74.8
## 442 81.7
## 443 69.6
## 444 59.9
#Gender is messy, let's clean it up
LEbyCandG[LEbyCandG=="WID.Male.Life.Expectancy.in.Years"] <- "Male"
LEbyCandG[LEbyCandG=="WID.Female.Life.Expectancy.in.Years"] <- "Female"
#Change the life expectancy column from char to dbl
LEbyCandG$Life.Expectancy <- as.numeric(LEbyCandG$Life.Expectancy)
## Warning: NAs introduced by coercion
#Get rid of countries with no life expectancy data
LEbyCandG <- LEbyCandG[rowSums(is.na(LEbyCandG))==0, ]
#Take a look and make sure all of that worked properly
head(LEbyCandG)
## WID.Country Gender Life.Expectancy
## 1 Afghanistan Male 59.2
## 2 Albania Male 75.4
## 3 Algeria Male 72.5
## 6 Angola Male 50.8
## 7 Antigua and Barbuda Male 73.5
## 8 Argentina Male 72.4
tail(LEbyCandG)
## WID.Country Gender Life.Expectancy
## 439 Europe & Central Asia Female 80.4
## 440 Latin America & Caribbean Female 78.3
## 441 Middle East & North Africa Female 74.8
## 442 North America Female 81.7
## 443 South Asia Female 69.6
## 444 Sub-Saharan Africa Female 59.9
#Let's see what the life expectancy is by gender
LEbyCandG %>%
group_by(Gender) %>%
summarise(Mean=mean(Life.Expectancy))
## # A tibble: 2 × 2
## Gender Mean
## <chr> <dbl>
## 1 Female 73.91005
## 2 Male 69.14019
#Plot the data
plot(LEbyCandG$WID.Country, LEbyCandG$Life.Expectancy, xlab = "Country", ylab = "Life Expectancy in Years", main = "Life Expectancy by Country and Gender", frame.plot = FALSE, col = ifelse(LEbyCandG$Gender=="Female", "magenta", "blue"))