Your task is to choose one dataset, then study the data and its associated description of the data (i.e. “data dictionary”). You should take the data, and create an R data frame with a subset of the columns (and if you like rows) in the dataset. Your deliverable is the R code to perform these transformation tasks.
I chose the flag data from UCI data sets. It contains the country names, and their various flag attributes. I have taken a subset of Eurpe and Africa/Asia landmass regions, and columns that contain number of bars and strips within a flag.
library(dplyr) # Selecting and filtering
library(magrittr) # Pipes
library(knitr) # kable
# Read the flags.data file online
flags_fulldump_df <- read.csv("https://archive.ics.uci.edu/ml/machine-learning-databases/flags/flag.data", header= FALSE, sep=",")
# Add column names (obtained from data dictionary)
colnames(flags_fulldump_df) <- c("name", "landmass", "zone",
"area", "population", "language",
"religion", "bars", "stripes",
"colours", "red", "green",
"blue", "gold", "white",
"black", "orange", "mainhue",
"circles", "crosses", "saltires",
"quarters", "sunstars", "crescent",
"triangle", "icon", "animate",
"text", "topleft", "botright")
# Find the structure of flags data frame
str(flags_fulldump_df)
## 'data.frame': 194 obs. of 30 variables:
## $ name : Factor w/ 194 levels "Afghanistan",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ landmass : int 5 3 4 6 3 4 1 1 2 2 ...
## $ zone : int 1 1 1 3 1 2 4 4 3 3 ...
## $ area : int 648 29 2388 0 0 1247 0 0 2777 2777 ...
## $ population: int 16 3 20 0 0 7 0 0 28 28 ...
## $ language : int 10 6 8 1 6 10 1 1 2 2 ...
## $ religion : int 2 6 2 1 0 5 1 1 0 0 ...
## $ bars : int 0 0 2 0 3 0 0 0 0 0 ...
## $ stripes : int 3 0 0 0 0 2 1 1 3 3 ...
## $ colours : int 5 3 3 5 3 3 3 5 2 3 ...
## $ red : int 1 1 1 1 1 1 0 1 0 0 ...
## $ green : int 1 0 1 0 0 0 0 0 0 0 ...
## $ blue : int 0 0 0 1 1 0 1 1 1 1 ...
## $ gold : int 1 1 0 1 1 1 0 1 0 1 ...
## $ white : int 1 0 1 1 0 0 1 1 1 1 ...
## $ black : int 1 1 0 0 0 1 0 1 0 0 ...
## $ orange : int 0 0 0 1 0 0 1 0 0 0 ...
## $ mainhue : Factor w/ 8 levels "black","blue",..: 5 7 5 2 4 7 8 7 2 2 ...
## $ circles : int 0 0 0 0 0 0 0 0 0 0 ...
## $ crosses : int 0 0 0 0 0 0 0 0 0 0 ...
## $ saltires : int 0 0 0 0 0 0 0 0 0 0 ...
## $ quarters : int 0 0 0 0 0 0 0 0 0 0 ...
## $ sunstars : int 1 1 1 0 0 1 0 1 0 1 ...
## $ crescent : int 0 0 1 0 0 0 0 0 0 0 ...
## $ triangle : int 0 0 0 1 0 0 0 1 0 0 ...
## $ icon : int 1 0 0 1 0 1 0 0 0 0 ...
## $ animate : int 0 1 0 1 0 0 1 0 0 0 ...
## $ text : int 0 0 0 0 0 0 0 0 0 0 ...
## $ topleft : Factor w/ 7 levels "black","blue",..: 1 6 4 2 2 6 7 1 2 2 ...
## $ botright : Factor w/ 8 levels "black","blue",..: 5 7 8 7 7 1 2 7 2 2 ...
# Display few lines of df
kable(head (flags_fulldump_df))
| Afghanistan |
5 |
1 |
648 |
16 |
10 |
2 |
0 |
3 |
5 |
1 |
1 |
0 |
1 |
1 |
1 |
0 |
green |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
1 |
0 |
0 |
black |
green |
| Albania |
3 |
1 |
29 |
3 |
6 |
6 |
0 |
0 |
3 |
1 |
0 |
0 |
1 |
0 |
1 |
0 |
red |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
1 |
0 |
red |
red |
| Algeria |
4 |
1 |
2388 |
20 |
8 |
2 |
2 |
0 |
3 |
1 |
1 |
0 |
0 |
1 |
0 |
0 |
green |
0 |
0 |
0 |
0 |
1 |
1 |
0 |
0 |
0 |
0 |
green |
white |
| American-Samoa |
6 |
3 |
0 |
0 |
1 |
1 |
0 |
0 |
5 |
1 |
0 |
1 |
1 |
1 |
0 |
1 |
blue |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
1 |
1 |
0 |
blue |
red |
| Andorra |
3 |
1 |
0 |
0 |
6 |
0 |
3 |
0 |
3 |
1 |
0 |
1 |
1 |
0 |
0 |
0 |
gold |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
blue |
red |
| Angola |
4 |
2 |
1247 |
7 |
10 |
5 |
0 |
2 |
3 |
1 |
0 |
0 |
1 |
0 |
1 |
0 |
red |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
1 |
0 |
0 |
red |
black |
# Create subset containing columns: landmass(Europe & Africa/Asia), country name, bars and stripes
flags_subset_df <- flags_fulldump_df %>%
select(landmass, name, bars, stripes) %>%
filter(landmass %in% c(3,4))
kable(head(flags_subset_df))
| 3 |
Albania |
0 |
0 |
| 4 |
Algeria |
2 |
0 |
| 3 |
Andorra |
3 |
0 |
| 4 |
Angola |
0 |
2 |
| 3 |
Austria |
0 |
3 |
| 3 |
Belgium |
3 |
0 |
# Set all zero values to NA so that they will not counted when aggregating count of bars and stripes
flags_subset_df[flags_subset_df == 0] <- NA
kable(head(flags_subset_df))
| 3 |
Albania |
NA |
NA |
| 4 |
Algeria |
2 |
NA |
| 3 |
Andorra |
3 |
NA |
| 4 |
Angola |
NA |
2 |
| 3 |
Austria |
NA |
3 |
| 3 |
Belgium |
3 |
NA |
# Group the values by Europe and Africa/Asia, then count the bars and stripes
flags_sum_df <- flags_subset_df %>%
group_by(landmass) %>%
summarise(totalCountries = sum(!is.na(name)),
totalBars = sum(!is.na(bars)),
totalStripes=sum(!is.na(stripes)))
kable(head(flags_sum_df))
# Add percentages
flags_percent_df <- flags_sum_df %>%
mutate(percentBars = round(totalBars/totalCountries,digits=2),
percentStripes = round(totalStripes/totalCountries,digits=2),
percentNone = round((totalCountries-(totalBars+totalStripes))/totalCountries, digits=2))
# Add rownames
rownames(flags_percent_df) <- c("Europe", "Africa/Asia")
#kable(flags_percent_df[,c(5,6,7)])
kable(flags_percent_df %>% select(percentBars, percentStripes, percentNone))
| Europe |
0.23 |
0.43 |
0.34 |
| Africa/Asia |
0.29 |
0.50 |
0.21 |
** It appears stripes are common in majority of Europe and Africa/Asia country flags. **