Choose any three of the “wide” datasets identified in the Week 5 Discussion items. (You may use your own dataset; please don’t use my Sample Post dataset, since that was used in your Week 6 assignment!) For each of the three chosen datasets:
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(gapminder)
library(ggplot2)
Create a .CSV file (or optionally, a MySQL database!) that includes all of the information included in the dataset. You’re encouraged to use a “wide” structure similar to how the information appears in the discussion item, so that you can practice tidying and transformations as described below.
#load the forest area data
Ecology<- read.csv("https://www.rforecology.com/data/pct_forest_world.csv")
#Edit column names
colnames(Ecology) <- sub("X", "", colnames(Ecology))
head(Ecology)
## Country.Name Country.Code Indicator.Name
## 1 Aruba ABW Forest area (% of land area)
## 2 Africa Eastern and Southern AFE Forest area (% of land area)
## 3 Afghanistan AFG Forest area (% of land area)
## 4 Africa Western and Central AFW Forest area (% of land area)
## 5 Angola AGO Forest area (% of land area)
## 6 Albania ALB Forest area (% of land area)
## Indicator.Code 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971
## 1 AG.LND.FRST.ZS NA NA NA NA NA NA NA NA NA NA NA NA
## 2 AG.LND.FRST.ZS NA NA NA NA NA NA NA NA NA NA NA NA
## 3 AG.LND.FRST.ZS NA NA NA NA NA NA NA NA NA NA NA NA
## 4 AG.LND.FRST.ZS NA NA NA NA NA NA NA NA NA NA NA NA
## 5 AG.LND.FRST.ZS NA NA NA NA NA NA NA NA NA NA NA NA
## 6 AG.LND.FRST.ZS NA NA NA NA NA NA NA NA NA NA NA NA
## 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986
## 1 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## 2 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## 3 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## 4 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## 5 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## 6 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## 1987 1988 1989 1990 1991 1992 1993 1994 1995
## 1 NA NA NA 2.333333 2.333333 2.333333 2.333333 2.333333 2.333333
## 2 NA NA NA 40.565912 40.403946 40.235807 40.269690 40.103270 39.936851
## 3 NA NA NA 1.850994 1.850994 1.850994 1.850994 1.850994 1.850994
## 4 NA NA NA 22.776908 22.658746 22.540583 22.422421 22.304258 22.186096
## 5 NA NA NA 63.578070 63.453407 63.328745 63.204082 63.079419 62.954757
## 6 NA NA NA 28.788321 28.717153 28.645985 28.574818 28.503650 28.432482
## 1996 1997 1998 1999 2000 2001 2002
## 1 2.333333 2.333333 2.333333 2.333333 2.333333 2.333333 2.333333
## 2 39.770431 39.604012 39.437592 39.271172 33.980631 38.903980 38.703207
## 3 1.850994 1.850994 1.850994 1.850994 1.850994 1.850994 1.850994
## 4 22.067933 21.949771 21.831608 21.713446 21.595284 21.503156 21.411029
## 5 62.830094 62.705431 62.580768 62.456106 62.331443 61.886219 61.440995
## 6 28.361314 28.290146 28.218978 28.147810 28.076642 28.123248 28.169854
## 2003 2004 2005 2006 2007 2008 2009
## 1 2.333333 2.333333 2.333333 2.333333 2.333333 2.333333 2.333333
## 2 38.502435 37.899951 37.701120 37.502465 37.303880 37.105259 36.906596
## 3 1.850994 1.850994 1.850994 1.850994 1.850994 1.850994 1.850994
## 4 21.318902 21.226775 21.134648 21.042520 20.950393 20.858266 20.766139
## 5 60.995770 60.550546 60.105322 59.660098 59.214874 58.769649 58.324425
## 6 28.216460 28.263066 28.309672 28.356277 28.402883 28.449489 28.496095
## 2010 2011 2012 2013 2014 2015 2016
## 1 2.333333 2.333333 2.333333 2.333333 2.333333 2.333333 2.333333
## 2 31.910878 32.183135 31.972492 31.761842 31.551270 31.340526 31.119431
## 3 1.850994 1.850994 1.850994 1.850994 1.850994 1.850994 1.850994
## 4 20.674012 20.586055 20.498099 20.410143 20.322187 20.234231 20.152610
## 5 57.879201 57.433976 56.988751 56.543526 56.098301 55.653076 55.207845
## 6 28.542701 28.594653 28.646606 28.698558 28.750511 28.802464 28.802190
## 2017 2018 2019 2020
## 1 2.333333 2.333333 2.333333 2.333333
## 2 30.903543 30.690224 30.469798 30.251786
## 3 1.850994 1.850994 1.850994 1.850994
## 4 20.071327 19.986100 19.901215 19.816762
## 5 54.762629 54.317406 53.872175 53.426951
## 6 28.792062 28.791971 28.791971 28.791971
# Convert a long format table
Ecology_long <- pivot_longer(Ecology, cols = "1988":"1995", names_to = "year",
values_to = "pct_forest_area", values_drop_na = TRUE)
head(Ecology_long)
## # A tibble: 6 × 59
## Country.Name Country.Code Indicator.Name Indicator.Code `1960` `1961` `1962`
## <chr> <chr> <chr> <chr> <lgl> <lgl> <lgl>
## 1 Aruba ABW Forest area (% … AG.LND.FRST.ZS NA NA NA
## 2 Aruba ABW Forest area (% … AG.LND.FRST.ZS NA NA NA
## 3 Aruba ABW Forest area (% … AG.LND.FRST.ZS NA NA NA
## 4 Aruba ABW Forest area (% … AG.LND.FRST.ZS NA NA NA
## 5 Aruba ABW Forest area (% … AG.LND.FRST.ZS NA NA NA
## 6 Aruba ABW Forest area (% … AG.LND.FRST.ZS NA NA NA
## # ℹ 52 more variables: `1963` <lgl>, `1964` <lgl>, `1965` <lgl>, `1966` <lgl>,
## # `1967` <lgl>, `1968` <lgl>, `1969` <lgl>, `1970` <lgl>, `1971` <lgl>,
## # `1972` <lgl>, `1973` <lgl>, `1974` <lgl>, `1975` <lgl>, `1976` <lgl>,
## # `1977` <lgl>, `1978` <lgl>, `1979` <lgl>, `1980` <lgl>, `1981` <lgl>,
## # `1982` <lgl>, `1983` <lgl>, `1984` <lgl>, `1985` <lgl>, `1986` <lgl>,
## # `1987` <lgl>, `1996` <dbl>, `1997` <dbl>, `1998` <dbl>, `1999` <dbl>,
## # `2000` <dbl>, `2001` <dbl>, `2002` <dbl>, `2003` <dbl>, `2004` <dbl>, …
# Filter out all rows for China
China <- filter(Ecology_long, Country.Name == "China")
# Create a scatter plot
plot(data = China, pct_forest_area ~ year, pch = 16, col = "blue", xlab = "Year", ylab = "Percentage of Forest Area", main = "China")
# Create a wide format
Ecology_wide <- pivot_wider(data = Ecology_long, id_cols = c("Country.Name", "Country.Code"),
names_from = "year", values_from = "pct_forest_area")
head(Ecology_wide)
## # A tibble: 6 × 8
## Country.Name Country.Code `1990` `1991` `1992` `1993` `1994` `1995`
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Aruba ABW 2.33 2.33 2.33 2.33 2.33 2.33
## 2 Africa Eastern and Sou… AFE 40.6 40.4 40.2 40.3 40.1 39.9
## 3 Afghanistan AFG 1.85 1.85 1.85 1.85 1.85 1.85
## 4 Africa Western and Cen… AFW 22.8 22.7 22.5 22.4 22.3 22.2
## 5 Angola AGO 63.6 63.5 63.3 63.2 63.1 63.0
## 6 Albania ALB 28.8 28.7 28.6 28.6 28.5 28.4
The data named forest area as a percent of total land area for each country from the World Bank’s Open Data Catalogue.The data is currently in wide format, take out the “X” that’s in front of every single year before reshaping the wide data into longer format to analysis. In the long data format, the country name occurs many times in the first column. Each row of the data table is one year’s measurement of %forest area in a certain country. For example, plot the data for %forest cover over years in China. Then, the data is widen again and spread it out instead of gathering it into a longer form that contain essential identifying information for each observation.
hsb2 <- read.csv('https://stats.idre.ucla.edu/stat/r/faq/hsb2.csv', header=T, sep=",")
long <- reshape(hsb2,
varying = c("read", "write", "math", "science", "socst"),
v.names = "score",
timevar = "subj",
times = c("read", "write", "math", "science", "socst"),
new.row.names = 1:1000,
direction = "long")
## Long to wide
wide_format <- reshape(long,
timevar = "subj",
idvar = c("id", "female", "race", "ses", "schtyp", "prog"),
direction = "wide")
dim(wide_format)
## [1] 200 11
dim(hsb2)
## [1] 200 11
The second data set is in wide format which is measured using five metrics: read, write, math, science and sorts. The dimensions of our wide data set to our original dataset are the same.
The third data set is to find the relationship between GDP per capita in 1987 and 2007. across all the countries in the dataset. To achieve this, the data frame is needed to change a wide form of the table, where each reps is a country and each column a year , with a values of gdpPercap in each cell of the table.
# Create Gapminder dataset to a CSV file
write.csv(gapminder, file = "gapminder.csv", row.names = FALSE)
head(gapminder)
## # A tibble: 6 × 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Afghanistan Asia 1952 28.8 8425333 779.
## 2 Afghanistan Asia 1957 30.3 9240934 821.
## 3 Afghanistan Asia 1962 32.0 10267083 853.
## 4 Afghanistan Asia 1967 34.0 11537966 836.
## 5 Afghanistan Asia 1972 36.1 13079460 740.
## 6 Afghanistan Asia 1977 38.4 14880372 786.
#Reshape the long format to wide form
gdp_wide <- gapminder %>%
# select only the columns we're interested in
select(country, year, gdpPercap) %>%
# use pivot_wider to go from long to wide format
pivot_wider(names_from = "year",
names_prefix = "yr",
values_from = "gdpPercap")
gdp_wide
## # A tibble: 142 × 13
## country yr1952 yr1957 yr1962 yr1967 yr1972 yr1977 yr1982 yr1987 yr1992 yr1997
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Afghan… 779. 821. 853. 836. 740. 786. 978. 852. 649. 635.
## 2 Albania 1601. 1942. 2313. 2760. 3313. 3533. 3631. 3739. 2497. 3193.
## 3 Algeria 2449. 3014. 2551. 3247. 4183. 4910. 5745. 5681. 5023. 4797.
## 4 Angola 3521. 3828. 4269. 5523. 5473. 3009. 2757. 2430. 2628. 2277.
## 5 Argent… 5911. 6857. 7133. 8053. 9443. 10079. 8998. 9140. 9308. 10967.
## 6 Austra… 10040. 10950. 12217. 14526. 16789. 18334. 19477. 21889. 23425. 26998.
## 7 Austria 6137. 8843. 10751. 12835. 16662. 19749. 21597. 23688. 27042. 29096.
## 8 Bahrain 9867. 11636. 12753. 14805. 18269. 19340. 19211. 18524. 19036. 20292.
## 9 Bangla… 684. 662. 686. 721. 630. 660. 677. 752. 838. 973.
## 10 Belgium 8343. 9715. 10991. 13149. 16672. 19118. 20980. 22526. 25576. 27561.
## # ℹ 132 more rows
## # ℹ 2 more variables: yr2002 <dbl>, yr2007 <dbl>
gdp_wide %>%
ggplot(aes(yr1987, yr2007)) +
geom_point() +
geom_abline() +
scale_x_continuous(trans = "log10") +
scale_y_continuous(trans = "log10")
#reverse reshape from wide to long format
gdp_wide %>%
pivot_longer(cols = yr1952:yr2007,
names_to = "year",
values_to = "gdpPercap")
## # A tibble: 1,704 × 3
## country year gdpPercap
## <fct> <chr> <dbl>
## 1 Afghanistan yr1952 779.
## 2 Afghanistan yr1957 821.
## 3 Afghanistan yr1962 853.
## 4 Afghanistan yr1967 836.
## 5 Afghanistan yr1972 740.
## 6 Afghanistan yr1977 786.
## 7 Afghanistan yr1982 978.
## 8 Afghanistan yr1987 852.
## 9 Afghanistan yr1992 649.
## 10 Afghanistan yr1997 635.
## # ℹ 1,694 more rows
A scatter plot illustrates the GDP per capita in the year 1987 on x-axis and that of the year 2007 on y-axis in terms of logarithmic scale. Points above the diagonal line represents countries where GDP per capita increased fro 1987 to 2007, where points below the diagonal line represent countries where it decreased.
References:
[1].https://www.rforecology.com.
[2].https://stats.oarc.ucla.edu.
[3].https://www.gapminder.org