I have gotten tired of this, so I will just reuse the datasets I used in PS4: Gapminder data (http://www.gapminder.org/data/).
I investigated 3 datasets - Cell Phones, Internet, and PC Per 100 People
I created a function, createTidy(df, varname = "DataName"), that creates a tidy data frame from Gapminder data.
# The Gapminder website contains over 500 data sets with information about
# the world's population. Your task is to continue the investigation you did at the
# end of Problem Set 4 or you can start fresh and choose a different
# data set from Gapminder.
# If you're feeling adventurous or want to try some data munging see if you can
# find a data set or scrape one from the web.
# In your investigation, examine 3 or more variables and create 2-5 plots that make
# use of the techniques from Lesson 5.
# You can find a link to the Gapminder website in the Instructor Notes.
# Once you've completed your investigation, create a post in the discussions that includes:
# 1. the variable(s) you investigated, your observations, and any summary statistics
# 2. snippets of code that created the plots
# 3. links to the images of your plots
# Copy and paste all of the code that you used for
# your investigation, and submit it when you are ready.
# ============================================================================================
Year: Cell phones per 100 had data since 1965, but didn’t have non-zero values until 1980 (Finland), whereas the other counts didn’t start until 1990. PC count only went up to 2006, while Internet and Cell Phone counts went until 2011. So in my combined data set I only went from 1990 to 2006. Country: Internet and Cell had 275 unique countries. PC only had 196 unique countries. I picked 16 at random for most of my studies.
This data was like the yogurt data in that we had observations (of the same country) over time, not like Facebook data which was a snapshot of data at a single point in time. Observations:
library(tidyr)
library(dplyr)
library(reshape2)
library(ggplot2)
library(GGally)
# Load the three Gapminder exports. `row.names = 1` turns the first CSV column
# into the row names (createTidy() later reads them back as the country).
dfcell <- read.csv(
  file = "C:/Users/Susan/Documents/DataAnalystNanoDegree/DataAnalysisWithR/safeCellPhonePer100.csv",
  header = TRUE,
  row.names = 1
)
dfpc <- read.csv(
  file = "C:/Users/Susan/Documents/DataAnalystNanoDegree/DataAnalysisWithR/savePcPer100.csv",
  header = TRUE,
  row.names = 1
)
dfinternet <- read.csv(
  file = "C:/Users/Susan/Documents/DataAnalystNanoDegree/DataAnalysisWithR/safeInternetPer100.csv",
  header = TRUE,
  row.names = 1
)
#' Reshape a wide Gapminder table into a tidy (long) data frame.
#'
#' The value column name is supplied as a string (`varname`). This used to
#' require tidyr's standard-evaluation `gather_()`, which is now deprecated;
#' the reshape is done in base R instead, so no non-standard evaluation is
#' involved and an empty (zero-column) table is handled cleanly.
#'
#' @param df Wide data frame whose row names are the countries and whose
#'   columns are one per year. Column names are expected to carry exactly one
#'   leading character before the 4-digit year (e.g. "X1965"); characters 2-5
#'   of each name are used as the year.
#' @param varname Name to give the value column of the result.
#' @return A data frame with columns `country`, `year` (character) and
#'   `varname`, one row per country/year pair. Rows are stacked column by
#'   column: every country for the first year, then every country for the
#'   next year, and so on.
createTidy <- function(df, varname = "DataName") {
  numcols <- ncol(df)
  year_idx <- seq_len(numcols)  # safe when numcols == 0, unlike 1:numcols

  # Drop the one-character prefix from the column names to get the year.
  # year is deliberately kept as character, not converted to numeric.
  years <- substr(colnames(df)[year_idx], 2, 5)

  # Stack the year columns on top of each other (column-major order).
  values <- unlist(df[year_idx], use.names = FALSE)
  if (is.null(values)) {
    values <- logical(0)  # unlist() of zero columns is NULL
  }

  tidydf <- data.frame(
    country = rep(row.names(df), times = numcols),
    year = rep(years, each = nrow(df)),
    value = values,
    stringsAsFactors = FALSE
  )
  names(tidydf) <- c("country", "year", varname)
  tidydf
}
# Reshape the wide cell-phone table to long form and inspect the result.
tidycell <- createTidy(df = dfcell, varname = "CellPhonePer100")
summary(object = tidycell)
## country year CellPhonePer100
## Length:12925 Length:12925 Min. : 0.000
## Class :character Class :character 1st Qu.: 0.000
## Mode :character Mode :character Median : 0.005
## Mean : 17.435
## 3rd Qu.: 12.096
## Max. :243.498
## NA's :5238
# Reshape the wide PC table to long form and inspect the result.
tidypc <- createTidy(df = dfpc, varname = "PCPer100")
summary(object = tidypc)
## country year PCPer100
## Length:3528 Length:3528 Min. : 0.0100
## Class :character Class :character 1st Qu.: 0.7475
## Mode :character Mode :character Median : 3.4400
## Mean :10.1256
## 3rd Qu.:12.3800
## Max. :94.5800
## NA's :1332
# Reshape the wide Internet table to long form and inspect the result.
tidyinternet <- createTidy(df = dfinternet, varname = "InternetPer100")
summary(object = tidyinternet)
## country year InternetPer100
## Length:6050 Length:6050 Min. : 0.0000
## Class :character Class :character 1st Qu.: 0.2687
## Mode :character Mode :character Median : 3.6009
## Mean :15.3103
## 3rd Qu.:22.1847
## Max. :96.6184
## NA's :2376
# Seed the RNG so the random country sample is reproducible.
set.seed(1836)
countryList <- unique(tidycell$country)
# sample country list for Cell Per 100 (more countries than in combined dataset)
sampleCountryList <- sample(countryList, 16, replace = FALSE)

# Union of the years / countries seen in any of the three tidy tables.
# The column is named `year`; `$years` matches no column and yields NULL,
# which previously left all_years empty.
all_years <- unique(c(tidycell$year, tidypc$year, tidyinternet$year))
all_countries <- unique(c(tidycell$country, tidypc$country, tidyinternet$country))

# Join the three tables on (year, country); merge() keeps only the
# year/country pairs that appear in every table.
total <- merge(tidycell, tidypc, by = c("year", "country"))
total <- merge(total, tidyinternet, by = c("year", "country"))

# get rid of NA's
no_na_total <- na.omit(total)
# no_na_total is the dataset I use that gets rid of NA's and has countries and years for all 3 datasets
# it does decrease the number of countries to 180 and 17 years (1990-2006)
summary(no_na_total)
## year country CellPhonePer100 PCPer100
## Length:1929 Length:1929 Min. : 0.0000 Min. : 0.01
## Class :character Class :character 1st Qu.: 0.7103 1st Qu.: 0.88
## Mode :character Mode :character Median : 5.9810 Median : 3.83
## Mean : 20.7751 Mean :10.72
## 3rd Qu.: 29.1046 3rd Qu.:12.91
## Max. :153.1403 Max. :94.58
## InternetPer100
## Min. : 0.0000
## 1st Qu.: 0.2379
## Median : 1.9761
## Mean :10.0020
## 3rd Qu.:10.9226
## Max. :88.6932
# Countries that remain after dropping rows with missing values.
nntc <- unique(no_na_total$country)
length(nntc)
## [1] 180
# Long form of the combined table: columns 1-2 (year, country) are kept as
# identifiers, columns 3-5 are stacked into `value`, labelled by `Data`.
meltednna <- melt(
  no_na_total,
  id.vars = c(1, 2),
  measure.vars = 3:5,
  variable.name = "Data"
)
# Look at all variables together for 6 random countries.
# The sample is drawn from nntc (the countries actually present in
# no_na_total). Drawing from all_countries could pick countries that were
# dropped by the merge / na.omit step, leaving fewer than 6 in the plot.
set.seed(124)
ggpairs(no_na_total[no_na_total$country %in% sample(nntc, 6, replace = FALSE), ])
I want to compare different countries, so I pick 16 at random and compare them.
# Draw the 16-country comparison sample (seeded so it is reproducible).
set.seed(287)
nntcSample <- sample(x = nntc, size = 16, replace = FALSE)
# Use the minimal theme with base font size 20 for the plots that follow.
theme_set(theme_minimal(base_size = 20))
# These are my random countries
nntcSample
## [1] "Rwanda" "El Salvador" "Cyprus" "France" "Sri Lanka"
## [6] "New Zealand" "Bulgaria" "Austria" "Cuba" "Mexico"
## [11] "Bahrain" "Uzbekistan" "Vietnam" "Ghana" "Eritrea"
## [16] "Jamaica"
#no_na_total[no_na_total$country %in% nntcSample,]
# One panel per sampled country, year on the x axis, three point layers:
#   red circles     = cell phones per 100
#   black triangles = Internet users per 100
#   green diamonds  = PCs per 100
ggplot(
  data = no_na_total[no_na_total$country %in% nntcSample, ],
  mapping = aes(x = year)
) +
  facet_wrap(~country) +
  geom_point(
    mapping = aes(y = CellPhonePer100),
    colour = "red", fill = "red", shape = 16, size = 3, alpha = .5
  ) +
  geom_point(
    mapping = aes(y = InternetPer100),
    colour = "black", fill = "black", shape = 17, size = 3, alpha = .5
  ) +
  geom_point(
    mapping = aes(y = PCPer100),
    colour = "green", fill = "green", shape = 18, size = 3, alpha = .5
  ) +
  theme(axis.text.x = element_text(angle = 45)) +
  # Label only every fifth year to keep the axis readable.
  scale_x_discrete(breaks = c(1990, 1995, 2000, 2005, 2010)) +
  labs(title = "Cell Phone, Internet, and PC Use per 100 People")
The whole dataset was melted, so it has only one data column instead of 3. I want to use a heatmap!
Can I make each row a country and each column a year?
I need to melt the data as a matrix where each row is a country's CellPhonePer100 and each column is a year from 1965 to 2011.
# Heatmap of cell phones per 100 people: one row per sampled country, one
# column per year after 1975, colour running from blue (low) to red (high).
# `year` is stored as character, so it is converted before the comparison;
# comparing the strings to a number would fall back to string ordering.
ggplot(
  data = tidycell[
    (tidycell$country %in% nntcSample) & (as.numeric(tidycell$year) > 1975),
  ],
  mapping = aes(y = country, x = year, fill = CellPhonePer100)
) +
  geom_tile() +
  scale_fill_gradientn(colours = colorRampPalette(c("blue", "red"))(100)) +
  theme(axis.text.x = element_text(angle = 45)) +
  # Label only every fifth year to keep the axis readable.
  scale_x_discrete(breaks = c(1975, 1980, 1985, 1990, 1995, 2000, 2005, 2010)) +
  labs(title = "Cell Phone Use per 100 People")
# Jittered scatter of cell phones per 100 people against year, using every
# country in the combined (NA-free) dataset.
p1 <- ggplot(
  data = no_na_total,
  mapping = aes(x = year, y = CellPhonePer100)
) +
  geom_jitter(shape = 21, fill = I("#F79420"), alpha = .2) +
  theme(axis.text.x = element_text(angle = 45))
You can see a general trend of increasing cell phone penetration per 100 people as the years go on. Some countries have more phones than people! The dark clusters near 0 phones per 100 shrink over time, and by 2006 it looks like they are almost gone. So cell phones have penetrated the world! Internet use has not gone up as high, but it seems to be greater than 0 almost everywhere. PC use was much higher in 1990 when my dataset starts.
# Same jittered scatter as p1, for Internet users per 100 people.
p2 <- ggplot(
  data = no_na_total,
  mapping = aes(x = year, y = InternetPer100)
) +
  geom_jitter(shape = 21, fill = I("#F79420"), alpha = .2) +
  theme(axis.text.x = element_text(angle = 45))
# Same jittered scatter as p1, for PCs per 100 people.
p3 <- ggplot(
  data = no_na_total,
  mapping = aes(x = year, y = PCPer100)
) +
  geom_jitter(shape = 21, fill = I("#F79420"), alpha = .2) +
  theme(axis.text.x = element_text(angle = 45))
library(gridExtra)
## Loading required package: grid
# Show the three scatters side by side in a single row.
grid.arrange(p1, p2, p3, nrow = 1)
# Same three scatters drawn from the melted table: one facet per measure.
ggplot(data = meltednna, mapping = aes(x = year, y = value)) +
  facet_wrap(~Data) +
  geom_jitter(shape = 21, fill = I("#F79420"), alpha = .2) +
  theme(axis.text.x = element_text(angle = 45))
# All three measures overlaid in one panel, told apart by shape and colour.
ggplot(data = meltednna, mapping = aes(x = year, y = value)) +
  geom_jitter(mapping = aes(shape = Data, colour = Data), alpha = .3) +
  theme(axis.text.x = element_text(angle = 45)) +
  labs(title = "Cell Phone, PC, and Internet Use per 100 People")