# ---- Load and trim the female account-ownership extract ----
# Reads female.csv, keeps Brazil / India / United States, and reduces the table
# to the country name plus the 2014, 2017 and 2021 survey columns.

# Install tidyverse only when it is missing, so re-running the script does not
# re-download the package every time.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
library(tidyverse)

# Read through an explicit path instead of setwd(), so the session's working
# directory is left untouched.
female_dir <- "/Users/faizshaikh/Downloads/P_Data_Extract_From_World_Development_Indicators"
female <- read.csv(file.path(female_dir, "female.csv"))

# Determine that R does consider female to be data frame
is.data.frame(female)
head(female)

female <- female %>%
  filter(Country.Name %in% c("Brazil", "India", "United States"))
head(female)

colnames(female)

# Positions of the name column and the three survey-year columns; rel_col is
# kept as a variable because later steps index male.csv with it.
rel_col <- which(colnames(female) %in% c(
  "Country.Name", "X2014..YR2014.", "X2017..YR2017.", "X2021..YR2021."
))
rel_col

# Rename by the original column names rather than by position, so a different
# column order in the export cannot attach the wrong year label.
female <- female[rel_col] %>%
  rename(
    "2014" = "X2014..YR2014.",
    "2017" = "X2017..YR2017.",
    "2021" = "X2021..YR2021."
  )
head(female)
# ---- Reshape the female table to one row per year ----
# After the transpose each country is a column and the row names are the years.
transpose_f <- data.frame(t(female[-1]))
colnames(transpose_f) <- female[, 1]
head(transpose_f)

# Column classes before conversion; vapply() pins the result type.
print(vapply(transpose_f, function(col) class(col)[1], character(1)))

# Every column is a country, so convert them all in one step. Assigning into
# transpose_f[] keeps the row names (the years) and the column names.
transpose_f[] <- lapply(transpose_f, as.numeric)
head(transpose_f)

# summarise_each() is deprecated; across() computes the same per-country means.
transpose_f %>%
  summarise(across(everything(), mean))

transpose_f %>%
  summarise(across(
    c(Brazil, India, "United States"),
    list(mean = mean, sd = sd)
  ))

# Take the x values from the row names so they always line up with the rows.
# `year` stays a top-level variable because later plots refer to it.
year <- as.numeric(rownames(transpose_f))
ggplot(data = transpose_f, aes(x = year, y = India, group = 1)) +
  geom_line() +
  geom_point()
# ---- Load the male extract and combine it with the female table ----
male_dir <- "/Users/faizshaikh/Downloads/P_Data_Extract_From_World_Development_Indicators-2"

# Select the columns by name rather than reusing the positions computed for
# female.csv, which are only valid if both files share the same layout.
male <- read.csv(file = file.path(male_dir, "male.csv")) %>%
  filter(Country.Name %in% c("Brazil", "India", "United States")) %>%
  select(
    Country.Name,
    "2014" = "X2014..YR2014.",
    "2017" = "X2017..YR2017.",
    "2021" = "X2021..YR2021."
  )

# One row per year, one column per country.
transpose_m <- data.frame(t(male[-1]))
colnames(transpose_m) <- male$Country.Name

# as.character() first so a factor column converts by label, not level code.
transpose_m[] <- lapply(
  transpose_m,
  function(col) as.numeric(as.character(col))
)
head(transpose_m)

# Rename by country name so the _m suffix cannot land on the wrong country if
# the rows of the CSV are ordered differently than expected.
transpose_m <- rename(
  transpose_m,
  Brazil_m = Brazil,
  India_m = India,
  United_States_m = "United States"
)
head(transpose_m)

# Move the years out of the row names so they can serve as the join key.
transpose_m <- rownames_to_column(transpose_m, var = "Year")
transpose_f <- rownames_to_column(transpose_f, var = "Year")

# Keep every male row and attach the female columns for the same year.
acct_owner_by_gender <- left_join(transpose_m, transpose_f, by = "Year")
acct_owner_by_gender <- rename(
  acct_owner_by_gender,
  United_States = "United States"
)
head(acct_owner_by_gender)
# ---- Combined plot: account ownership by country and gender over time ----
# Unsuffixed columns are the female series, *_m columns the male series.
# x is taken from the table's own Year column (character after
# rownames_to_column(), hence as.numeric) instead of a separate global vector,
# so every point is plotted against the year of its own row.
gfg_plot <- ggplot(acct_owner_by_gender, aes(x = as.numeric(Year))) +
  geom_line(aes(y = India), color = "black") +
  geom_line(aes(y = India_m), color = "red") +
  geom_line(aes(y = Brazil), color = "green") +
  geom_line(aes(y = Brazil_m), color = "blue") +
  geom_line(aes(y = United_States), color = "purple") +
  geom_line(aes(y = United_States_m), color = "violet") +
  xlab("Year")

# Part 1: (Cleaning up the final plot) The final plot is somewhat
# misleading: the y-axis is titled "India". Write down the code that will
# change it to percentage ownership.
gfg_plot <- gfg_plot + ylab("percentage ownership")

gfg_plot
# Part 2: (Drawing Conclusions) After drawing the plot that includes
# how percentages change over time, use this to draw some (fairly basic)
# conclusions on the how the ownership percentages have been changed. For
# example, you can discuss whether males have more account ownership than
# females and whether the data suggests that account ownership percentages
# are on the rise.

# writeLines() emits the text as written and cat("\n") emits a real blank
# line; print() would show the strings quoted with their escapes visible.
writeLines("Based on the data and the plot that was generated for how
percentage ownership changes over time. We can conclude that there seems
to be almost a pivot, or shift in the overall data at roughly 2017. We
can dissect this by understanding the relationship between male and
female percentage ownership in each country. Starting with India, we can
see that from 2014 to 2017 both male and female account ownership
percentages were on the rise, however after that (from 2017 to 2020 and
beyond) it's been on a decline with male ownership percentage declining
at a faster rate. However, for the most part males appear to have more
account ownership than females in India, until the ending of data (most
recent) where they appear to be the same.")
cat("\n")
writeLines("Moving on to Brazil, we can see that males have had more
account ownership than females from 2014 all the way up until the current
(latest) available data. We can also see that for both males and females
the account ownership percentages have been on the rise. From 2014 to
roughly 2017 there definitely was a rise, however, from 2017 to the
latest, there was a signifciantly greater rise (slope) in percentage
ownership over time than compared with the rise in 2014 to 2017.")
cat("\n")
writeLines("Lastly, looking at the United States, we can see that the
data is not as straightforward as that of India and Brazil. We can see
that from 2014 to some point in early-mid 2016 that females had more
account ownership than males. However, from that same early-mid 2016
point up until somewhere late in 2017, males actually had more account
ownership percentages than females. We can also see that from 2014 to
2017, male account ownership percentage was on the rise, whereas female
account ownership percentages were declining. However, from 2017 all the
way to the current (latest) data, male account ownership percentage has
been on the decline, and female account ownership percentage is on the
rise.")
cat("\n")
writeLines("To conclude, we can see that, in general, when considering
account percentage ownership, it has been on the rise and there is a
greater percentage ownership now (in the current data) when compared to
that of 2014. Also, except for the United States, it appears that
historically males have had greater account ownership percentages than
females based upon our data of Brazil and India.")
# Part 3: (Cherry Picking) Is it possible to say that account
# percentages have been increasing if you disregard a certain year? Does
# this change if you focus on a particular country and gender
# combinations?

# writeLines() emits the text as written and cat("\n") emits a real blank
# line; print() would show the strings quoted with their escapes visible.
cat("\n")
writeLines("For the first part of the question where is asks if we can
conclude that account percentages have been increasing if we disregard a
certain year, this is not the case. We would need to disregard multiple
years for this to happen. For example, looking at the United State's
data, we see that the male account ownership percentage has been
decreasing from 2017 all the way to the current (latest) data which is
past 2020. The same can been seen in the data for India's male data,
where percentage ownernship has been decreasing from 2017 to the current
[latest] data as well (past 2020).")
cat("\n")
writeLines("Technically yes, if we don't consider the data for United
States male, United States female (decreasing from 2014-2017), India
male, then it appears the remaining data (India female, Brazil male, and
Brazil female) indicates that account ownership percentages have been
increasing overall. Or if we just want to focous on a particular country,
then Brazil will achieve that same result of an overall increasing
percentage ownership trend. We can't do this with a gender combination
(i.e. all males or females in our data set) because the data varies in
each country.")
# Part 4: (Summarizing the data yourself) Do a similar analysis using
# the csv files:
# 1: Account ownership at a financial institution or with a
#    mobile-money-service provider, primary education or less (% of
#    population ages 15+)
# 2: Account ownership at a financial institution or with a
#    mobile-money-service provider, secondary education or more (% of
#    population ages 15+)
#
# You should analyze exactly five countries. You are free to pick the
# countries but your choices should not include the United States, India
# nor Brazil. After a visual representation of the data is made, you
# should have a paragraph or two summarizing your conclusions. The summary
# should be about the length of half a page and no longer than a page.
# ---- Load and trim the primary-education account-ownership extract ----
# Reads primary.csv, keeps the five chosen countries, and reduces the table to
# the country name plus the 2014, 2017 and 2021 survey columns.

# Install tidyverse only when it is missing, so re-running the script does not
# re-download the package every time.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
library(tidyverse)

# Read through an explicit path instead of setwd(), so the session's working
# directory is left untouched.
primary_dir <- "/Users/faizshaikh/Downloads/P_Data_Extract_From_World_Development_Indicators-5"
primary <- read.csv(file.path(primary_dir, "primary.csv"))

# Determine that R does consider primary to be data frame
is.data.frame(primary)
head(primary)

primary <- primary %>%
  filter(Country.Name %in% c(
    "Canada", "China", "United Kingdom", "Germany", "Chile"
  ))
head(primary)

colnames(primary)

# Positions of the name column and the three survey-year columns; rel_col is
# kept as a variable because later steps index secondary.csv with it.
rel_col <- which(colnames(primary) %in% c(
  "Country.Name", "X2014..YR2014.", "X2017..YR2017.", "X2021..YR2021."
))
rel_col

# Rename by the original column names rather than by position, so a different
# column order in the export cannot attach the wrong year label.
primary <- primary[rel_col] %>%
  rename(
    "2014" = "X2014..YR2014.",
    "2017" = "X2017..YR2017.",
    "2021" = "X2021..YR2021."
  )
head(primary)
# ---- Reshape the primary-education table to one row per year ----
# After the transpose each country is a column and the row names are the years.
transpose_p <- data.frame(t(primary[-1]))
colnames(transpose_p) <- primary[, 1]
head(transpose_p)

# Column classes before conversion; vapply() pins the result type.
print(vapply(transpose_p, function(col) class(col)[1], character(1)))

# Every column is a country, so convert them all in one step. Assigning into
# transpose_p[] keeps the row names (the years) and the column names.
transpose_p[] <- lapply(transpose_p, as.numeric)
head(transpose_p)

# summarise_each() is deprecated; across() computes the same per-country means.
transpose_p %>%
  summarise(across(everything(), mean))

transpose_p %>%
  summarise(across(
    c(Canada, China, "United Kingdom", Germany, Chile),
    list(mean = mean, sd = sd)
  ))

# Take the x values from the row names so they always line up with the rows.
# `year` stays a top-level variable because later plots refer to it.
year <- as.numeric(rownames(transpose_p))
ggplot(data = transpose_p, aes(x = year, y = China, group = 1)) +
  geom_line() +
  geom_point()
# ---- Load the secondary-education extract and combine it with primary ----
secondary_dir <- "/Users/faizshaikh/Downloads/P_Data_Extract_From_World_Development_Indicators-6"

# Select the columns by name rather than reusing the positions computed for
# primary.csv, which are only valid if both files share the same layout.
secondary <- read.csv(file = file.path(secondary_dir, "secondary.csv")) %>%
  filter(Country.Name %in% c(
    "Canada", "China", "United Kingdom", "Germany", "Chile"
  )) %>%
  select(
    Country.Name,
    "2014" = "X2014..YR2014.",
    "2017" = "X2017..YR2017.",
    "2021" = "X2021..YR2021."
  )

# One row per year, one column per country.
transpose_s <- data.frame(t(secondary[-1]))
colnames(transpose_s) <- secondary$Country.Name

# as.character() first so a factor column converts by label, not level code.
transpose_s[] <- lapply(
  transpose_s,
  function(col) as.numeric(as.character(col))
)
head(transpose_s)

# Rename by country name. filter() keeps the CSV's row order, not the order the
# countries are listed in, so renaming columns 1..5 by position would attach
# the wrong country label unless the file happens to use that exact order.
transpose_s <- rename(
  transpose_s,
  Canada_s = Canada,
  China_s = China,
  United_Kingdom_s = "United Kingdom",
  Germany_s = Germany,
  Chile_s = Chile
)
head(transpose_s)

# Move the years out of the row names so they can serve as the join key.
transpose_s <- rownames_to_column(transpose_s, var = "Year")
transpose_p <- rownames_to_column(transpose_p, var = "Year")

# Keep every secondary row and attach the primary columns for the same year.
acct_owner_by_education <- left_join(transpose_s, transpose_p, by = "Year")

# Rename by name: with five countries per table, position 7 is the first
# primary-education column, not the United Kingdom column.
acct_owner_by_education <- rename(
  acct_owner_by_education,
  United_Kingdom = "United Kingdom"
)
head(acct_owner_by_education)
# ---- Combined plot: account ownership by country and education level ----
# Unsuffixed series come from transpose_p (primary education or less); *_s
# series come from transpose_s (secondary education or more). Both tables carry
# a character Year column by this point, so x is taken from the data itself.
education_plot <- ggplot(data = transpose_p, aes(x = as.numeric(Year))) +
  geom_line(aes(y = China, color = "China")) +
  geom_line(aes(y = Canada, color = "Canada")) +
  # Backticks are required because the column name contains a space.
  geom_line(aes(y = `United Kingdom`, color = "United Kingdom")) +
  geom_line(aes(y = Germany, color = "Germany")) +
  geom_line(aes(y = Chile, color = "Chile")) +
  geom_line(data = transpose_s, aes(y = China_s, color = "China_s")) +
  geom_line(data = transpose_s, aes(y = Canada_s, color = "Canada_s")) +
  geom_line(
    data = transpose_s,
    aes(y = United_Kingdom_s, color = "United Kingdom_s")
  ) +
  geom_line(data = transpose_s, aes(y = Germany_s, color = "Germany_s")) +
  geom_line(data = transpose_s, aes(y = Chile_s, color = "Chile_s")) +
  xlab("Year") +
  # ylab() has to be joined with `+`; as a separate statement it never
  # reaches the plot.
  ylab("Percentage Ownership")

# Print this plot. The earlier code printed gfg_plot here, which is the gender
# plot from Part 1, not the education plot.
education_plot

# (figure: ggplot.png, the exported image of the plot above)
# Opening remarks for the Part 4 write-up. The inner quotes around "zoom" are
# escaped so the string literal stays intact; writeLines()/cat() emit the text
# as written instead of print()'s quoted, escaped form.
writeLines("Now that we are using five countries, the data (and plot)
certainly is more complicated. We must press the \"zoom\" button in order
to fully see our plot in its appropriate scale since the countries vary
drastically in their data in percentage ownership.")
cat("\n")
writeLines("We can once again dissect our plot country by country to truly
understand it.")
---
title: "Project #1 - Faiz Shaikh"
output: html_notebook
---

# ---- Load and trim the female account-ownership extract ----
# Reads female.csv, keeps Brazil / India / United States, and reduces the table
# to the country name plus the 2014, 2017 and 2021 survey columns.

# Install tidyverse only when it is missing, so re-running the notebook does
# not re-download the package every time.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
library(tidyverse)

# Read through an explicit path instead of setwd(), so the session's working
# directory is left untouched.
female_dir <- "/Users/faizshaikh/Downloads/P_Data_Extract_From_World_Development_Indicators"
female <- read.csv(file.path(female_dir, "female.csv"))

# Determine that R does consider female to be data frame
is.data.frame(female)
head(female)

female <- female %>%
  filter(Country.Name %in% c("Brazil", "India", "United States"))
head(female)

colnames(female)

# Positions of the name column and the three survey-year columns; rel_col is
# kept as a variable because later steps index male.csv with it.
rel_col <- which(colnames(female) %in% c(
  "Country.Name", "X2014..YR2014.", "X2017..YR2017.", "X2021..YR2021."
))
rel_col

# Rename by the original column names rather than by position, so a different
# column order in the export cannot attach the wrong year label.
female <- female[rel_col] %>%
  rename(
    "2014" = "X2014..YR2014.",
    "2017" = "X2017..YR2017.",
    "2021" = "X2021..YR2021."
  )
head(female)

# ---- Reshape the female table to one row per year ----
# After the transpose each country is a column and the row names are the years.
transpose_f <- data.frame(t(female[-1]))
colnames(transpose_f) <- female[, 1]
head(transpose_f)

# Column classes before conversion; vapply() pins the result type.
print(vapply(transpose_f, function(col) class(col)[1], character(1)))

# Every column is a country, so convert them all in one step. Assigning into
# transpose_f[] keeps the row names (the years) and the column names.
transpose_f[] <- lapply(transpose_f, as.numeric)
head(transpose_f)

# summarise_each() is deprecated; across() computes the same per-country means.
transpose_f %>%
  summarise(across(everything(), mean))

transpose_f %>%
  summarise(across(
    c(Brazil, India, "United States"),
    list(mean = mean, sd = sd)
  ))

# Take the x values from the row names so they always line up with the rows.
# `year` stays a top-level variable because later plots refer to it.
year <- as.numeric(rownames(transpose_f))
ggplot(data = transpose_f, aes(x = year, y = India, group = 1)) +
  geom_line() +
  geom_point()

# ---- Load the male extract and combine it with the female table ----
male_dir <- "/Users/faizshaikh/Downloads/P_Data_Extract_From_World_Development_Indicators-2"

# Select the columns by name rather than reusing the positions computed for
# female.csv, which are only valid if both files share the same layout.
male <- read.csv(file = file.path(male_dir, "male.csv")) %>%
  filter(Country.Name %in% c("Brazil", "India", "United States")) %>%
  select(
    Country.Name,
    "2014" = "X2014..YR2014.",
    "2017" = "X2017..YR2017.",
    "2021" = "X2021..YR2021."
  )

# One row per year, one column per country.
transpose_m <- data.frame(t(male[-1]))
colnames(transpose_m) <- male$Country.Name

# as.character() first so a factor column converts by label, not level code.
transpose_m[] <- lapply(
  transpose_m,
  function(col) as.numeric(as.character(col))
)
head(transpose_m)

# Rename by country name so the _m suffix cannot land on the wrong country if
# the rows of the CSV are ordered differently than expected.
transpose_m <- rename(
  transpose_m,
  Brazil_m = Brazil,
  India_m = India,
  United_States_m = "United States"
)
head(transpose_m)

# Move the years out of the row names so they can serve as the join key.
transpose_m <- rownames_to_column(transpose_m, var = "Year")
transpose_f <- rownames_to_column(transpose_f, var = "Year")

# Keep every male row and attach the female columns for the same year.
acct_owner_by_gender <- left_join(transpose_m, transpose_f, by = "Year")
acct_owner_by_gender <- rename(
  acct_owner_by_gender,
  United_States = "United States"
)
head(acct_owner_by_gender)

# ---- Combined plot: account ownership by country and gender over time ----
# Unsuffixed columns are the female series, *_m columns the male series.
# x is taken from the table's own Year column (character after
# rownames_to_column(), hence as.numeric) instead of a separate global vector,
# so every point is plotted against the year of its own row.
gfg_plot <- ggplot(acct_owner_by_gender, aes(x = as.numeric(Year))) +
  geom_line(aes(y = India), color = "black") +
  geom_line(aes(y = India_m), color = "red") +
  geom_line(aes(y = Brazil), color = "green") +
  geom_line(aes(y = Brazil_m), color = "blue") +
  geom_line(aes(y = United_States), color = "purple") +
  geom_line(aes(y = United_States_m), color = "violet") +
  xlab("Year")

# Part 1: (Cleaning up the final plot) The final plot is somewhat misleading:
# the y-axis is titled "India". Write down the code that will change it to
# percentage ownership.
gfg_plot <- gfg_plot + ylab("percentage ownership")

gfg_plot


# Part 2: (Drawing Conclusions) After drawing the plot that includes how
# percentages change over time, use this to draw some (fairly basic)
# conclusions on the how the ownership percentages have been changed. For
# example, you can discuss whether males have more account ownership than
# females and whether the data suggests that account ownership percentages are
# on the rise.

# writeLines() emits the text as written and cat("\n") emits a real blank
# line; print() would show the strings quoted, with every line break rendered
# as a literal \n.
writeLines("Based on the data and the plot that was generated for how percentage ownership changes over time. We can conclude that there seems to be almost a pivot, or shift in the overall
 data at roughly 2017. We can dissect this by understanding the relationship between male and female percentage ownership in each country. Starting with India, we can see that 
 from 2014 to 2017 both male and female account ownership percentages were on the rise, however after that (from 2017 to 2020 and beyond) it's been on a decline with male ownership 
 percentage declining at a faster rate. However, for the most part males appear to have more account ownership than females in India, until the ending of data (most recent) where they
  appear to be the same.")
cat("\n")
writeLines("Moving on to Brazil, we can see that males have had more account ownership than females from 2014 all the way up until the current (latest) available data. We can also see
 that for both males and females the account ownership percentages have been on the rise. From 2014 to roughly 2017 there definitely was a rise, however, from 2017 to the latest, there
  was a signifciantly greater rise (slope) in percentage ownership over time than compared with the rise in 2014 to 2017.")
cat("\n")
writeLines("Lastly, looking at the United States, we can see that the data is not as straightforward as that of India and Brazil. We can see that from 2014 to some point in early-mid 2016 that
 females had more account ownership than males. However, from that same early-mid 2016 point up until somewhere late in 2017, males actually had more account ownership percentages than females.
  We can also see that from 2014 to 2017, male account ownership percentage was on the rise, whereas female account ownership percentages were declining. However, from 2017 all the way to the current  (latest) data, male account ownership percentage has been on the decline, and female account ownership percentage is on the rise.")
cat("\n")
writeLines("To conclude, we can see that, in general, when considering account percentage ownership, it has been on the rise and there is a greater percentage ownership now (in the current data) when
 compared to that of 2014. Also, except for the United States, it appears that historically males have had greater account ownership percentages than females based upon our data of Brazil and India.")
 
 
# Part 3: (Cherry Picking) Is it possible to say that account percentages have
# been increasing if you disregard a certain year? Does this change if you
# focus on a particular country and gender combinations?

# writeLines() emits the text as written and cat("\n") emits a real blank
# line; print() would show the strings quoted, with every line break rendered
# as a literal \n.
cat("\n")
writeLines("For the first part of the question where is asks if we can conclude that account percentages have been increasing if we disregard a certain year, this is not the case. We would need to
disregard multiple years for this to happen. For example, looking at the United State's data, we see that the male account ownership percentage has been decreasing from 2017 all the way to the
current (latest) data which is past 2020. The same can been seen in the data for India's male data, where percentage ownernship has been decreasing from 2017 to the current [latest] data as
 well (past 2020).")
cat("\n")
writeLines("Technically yes, if we don't consider the data for United States male, United States female (decreasing from 2014-2017), India male, then it appears the remaining data
  (India female, Brazil male, and Brazil female) indicates that account ownership percentages have been increasing overall. Or if we just want to focous on a particular country, then Brazil
   will achieve that same result of an overall increasing percentage ownership trend. We can't do this with a gender combination (i.e. all males or females in our data set) because the data 
   varies in each country.")
  
  
# Part 4: (Summarizing the data yourself) Do a similar analysis using the csv files:
# 1: Account ownership at a financial institution or with a mobile-money-service provider, primary education or less (% of population ages 15+)
# 2: Account ownership at a financial institution or with a mobile-money-service provider, secondary education or more (% of population ages 15+)

# You should analyze exactly five countries. You are free to pick the countries but your choices should not include the United States, India nor Brazil. After a visual representation of the data is made, you should have a paragraph or two summarizing your conclusions. The summary should be about the length of half a page and no longer than a page.

# ---- Load and trim the primary-education account-ownership extract ----
# Reads primary.csv, keeps the five chosen countries, and reduces the table to
# the country name plus the 2014, 2017 and 2021 survey columns.

# Install tidyverse only when it is missing, so re-running the notebook does
# not re-download the package every time.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
library(tidyverse)

# Read through an explicit path instead of setwd(), so the session's working
# directory is left untouched.
primary_dir <- "/Users/faizshaikh/Downloads/P_Data_Extract_From_World_Development_Indicators-5"
primary <- read.csv(file.path(primary_dir, "primary.csv"))

# Determine that R does consider primary to be data frame
is.data.frame(primary)
head(primary)

primary <- primary %>%
  filter(Country.Name %in% c(
    "Canada", "China", "United Kingdom", "Germany", "Chile"
  ))
head(primary)

colnames(primary)

# Positions of the name column and the three survey-year columns; rel_col is
# kept as a variable because later steps index secondary.csv with it.
rel_col <- which(colnames(primary) %in% c(
  "Country.Name", "X2014..YR2014.", "X2017..YR2017.", "X2021..YR2021."
))
rel_col

# Rename by the original column names rather than by position, so a different
# column order in the export cannot attach the wrong year label.
primary <- primary[rel_col] %>%
  rename(
    "2014" = "X2014..YR2014.",
    "2017" = "X2017..YR2017.",
    "2021" = "X2021..YR2021."
  )
head(primary)

# ---- Reshape the primary-education table to one row per year ----
# After the transpose each country is a column and the row names are the years.
transpose_p <- data.frame(t(primary[-1]))
colnames(transpose_p) <- primary[, 1]
head(transpose_p)

# Column classes before conversion; vapply() pins the result type.
print(vapply(transpose_p, function(col) class(col)[1], character(1)))

# Every column is a country, so convert them all in one step. Assigning into
# transpose_p[] keeps the row names (the years) and the column names.
transpose_p[] <- lapply(transpose_p, as.numeric)
head(transpose_p)

# summarise_each() is deprecated; across() computes the same per-country means.
transpose_p %>%
  summarise(across(everything(), mean))

transpose_p %>%
  summarise(across(
    c(Canada, China, "United Kingdom", Germany, Chile),
    list(mean = mean, sd = sd)
  ))

# Take the x values from the row names so they always line up with the rows.
# `year` stays a top-level variable because later plots refer to it.
year <- as.numeric(rownames(transpose_p))
ggplot(data = transpose_p, aes(x = year, y = China, group = 1)) +
  geom_line() +
  geom_point()

# ---- Load the secondary-education extract and combine it with primary ----
secondary_dir <- "/Users/faizshaikh/Downloads/P_Data_Extract_From_World_Development_Indicators-6"

# Select the columns by name rather than reusing the positions computed for
# primary.csv, which are only valid if both files share the same layout.
secondary <- read.csv(file = file.path(secondary_dir, "secondary.csv")) %>%
  filter(Country.Name %in% c(
    "Canada", "China", "United Kingdom", "Germany", "Chile"
  )) %>%
  select(
    Country.Name,
    "2014" = "X2014..YR2014.",
    "2017" = "X2017..YR2017.",
    "2021" = "X2021..YR2021."
  )

# One row per year, one column per country.
transpose_s <- data.frame(t(secondary[-1]))
colnames(transpose_s) <- secondary$Country.Name

# as.character() first so a factor column converts by label, not level code.
transpose_s[] <- lapply(
  transpose_s,
  function(col) as.numeric(as.character(col))
)
head(transpose_s)

# Rename by country name. filter() keeps the CSV's row order, not the order the
# countries are listed in, so renaming columns 1..5 by position would attach
# the wrong country label unless the file happens to use that exact order.
transpose_s <- rename(
  transpose_s,
  Canada_s = Canada,
  China_s = China,
  United_Kingdom_s = "United Kingdom",
  Germany_s = Germany,
  Chile_s = Chile
)
head(transpose_s)

# Move the years out of the row names so they can serve as the join key.
transpose_s <- rownames_to_column(transpose_s, var = "Year")
transpose_p <- rownames_to_column(transpose_p, var = "Year")

# Keep every secondary row and attach the primary columns for the same year.
acct_owner_by_education <- left_join(transpose_s, transpose_p, by = "Year")

# Rename by name: with five countries per table, position 7 is the first
# primary-education column, not the United Kingdom column.
acct_owner_by_education <- rename(
  acct_owner_by_education,
  United_Kingdom = "United Kingdom"
)
head(acct_owner_by_education)

# ---- Combined plot: account ownership by country and education level ----
# Unsuffixed series come from transpose_p (primary education or less); *_s
# series come from transpose_s (secondary education or more). Both tables carry
# a character Year column by this point, so x is taken from the data itself.
education_plot <- ggplot(data = transpose_p, aes(x = as.numeric(Year))) +
  geom_line(aes(y = China, color = "China")) +
  geom_line(aes(y = Canada, color = "Canada")) +
  geom_line(aes(y = `United Kingdom`, color = "United Kingdom")) +
  geom_line(aes(y = Germany, color = "Germany")) +
  geom_line(aes(y = Chile, color = "Chile")) +
  geom_line(data = transpose_s, aes(y = China_s, color = "China_s")) +
  geom_line(data = transpose_s, aes(y = Canada_s, color = "Canada_s")) +
  geom_line(
    data = transpose_s,
    aes(y = United_Kingdom_s, color = "United Kingdom_s")
  ) +
  geom_line(data = transpose_s, aes(y = Germany_s, color = "Germany_s")) +
  geom_line(data = transpose_s, aes(y = Chile_s, color = "Chile_s")) +
  xlab("Year") +
  # ylab() has to be joined with `+`; as a separate statement it never
  # reaches the plot.
  ylab("Percentage Ownership")

# Print this plot. The earlier code printed gfg_plot here, which is the gender
# plot from Part 1, not the education plot.
education_plot

![ggplot](/Users/faizshaikh/Desktop/ggplot.png)

# Opening remarks for the Part 4 write-up. The inner quotes around "zoom" are
# escaped; unescaped they end the string early and the file fails to parse.
# writeLines()/cat() emit the text as written instead of print()'s quoted,
# escaped form.
writeLines("Now that we are using five countries, the data (and plot) certainly is more complicated. We must press the \"zoom\" button in order to
 fully see our plot in its appropriate scale since the countries vary drastically in their data in percentage ownership.")
cat("\n")
writeLines("We can once again dissect our plot country by country to truly understand it.")