This HTML document is a report on the completion of the 4th week’s assignment. It summarizes the packages I have used and displays the code, along with a description of the data and its analysis in graphical form.
The following packages were used to perform the exercise:
library(ggplot2) #To produce graphs in R
library(gapminder) #To retrieve the data
library(tidyverse) #set of packages which share common data representations and 'API' design
The gapminder_unfiltered dataset, which is our analysis data, consists of 3313 rows and the following 6 columns. 1. country: (Name of the country) 2. continent: (Continent where the country lies) 3. year: (Values from 1950 to 2007) 4. lifeExp: (Life expectancy at birth, in years) 5. pop: (Total population of the country) 6. gdpPercap: (GDP per capita)
# Understanding the data: print the tibble for a first look at its rows and columns
gapminder_unfiltered
## # A tibble: 3,313 × 6
## country continent year lifeExp pop gdpPercap
## <fctr> <fctr> <int> <dbl> <int> <dbl>
## 1 Afghanistan Asia 1952 28.801 8425333 779.4453
## 2 Afghanistan Asia 1957 30.332 9240934 820.8530
## 3 Afghanistan Asia 1962 31.997 10267083 853.1007
## 4 Afghanistan Asia 1967 34.020 11537966 836.1971
## 5 Afghanistan Asia 1972 36.088 13079460 739.9811
## 6 Afghanistan Asia 1977 38.438 14880372 786.1134
## 7 Afghanistan Asia 1982 39.854 12881816 978.0114
## 8 Afghanistan Asia 1987 40.822 13867957 852.3959
## 9 Afghanistan Asia 1992 41.674 16317921 649.3414
## 10 Afghanistan Asia 1997 41.763 22227415 635.3414
## # ... with 3,303 more rows
# Create an explicit copy of the dataset under a local name; all later steps use mygap
mygap <- gapminder_unfiltered
# Check the structure: classes, number of observations, and the type of each column
str(mygap)
## Classes 'tbl_df', 'tbl' and 'data.frame': 3313 obs. of 6 variables:
## $ country : Factor w/ 187 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ continent: Factor w/ 6 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ year : int 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
## $ lifeExp : num 28.8 30.3 32 34 36.1 ...
## $ pop : int 8425333 9240934 10267083 11537966 13079460 14880372 12881816 13867957 16317921 22227415 ...
## $ gdpPercap: num 779 821 853 836 740 ...
# Check the total number of rows and columns
dim(mygap)
## [1] 3313 6
# Inspect the first and last rows (head() here, tail() below)
head(mygap)
## # A tibble: 6 × 6
## country continent year lifeExp pop gdpPercap
## <fctr> <fctr> <int> <dbl> <int> <dbl>
## 1 Afghanistan Asia 1952 28.801 8425333 779.4453
## 2 Afghanistan Asia 1957 30.332 9240934 820.8530
## 3 Afghanistan Asia 1962 31.997 10267083 853.1007
## 4 Afghanistan Asia 1967 34.020 11537966 836.1971
## 5 Afghanistan Asia 1972 36.088 13079460 739.9811
## 6 Afghanistan Asia 1977 38.438 14880372 786.1134
# Inspect the last six rows
tail(mygap)
## # A tibble: 6 × 6
## country continent year lifeExp pop gdpPercap
## <fctr> <fctr> <int> <dbl> <int> <dbl>
## 1 Zimbabwe Africa 1982 60.363 7636524 788.8550
## 2 Zimbabwe Africa 1987 62.351 9216418 706.1573
## 3 Zimbabwe Africa 1992 60.377 10704340 693.4208
## 4 Zimbabwe Africa 1997 46.809 11404948 792.4500
## 5 Zimbabwe Africa 2002 39.989 11926563 672.0386
## 6 Zimbabwe Africa 2007 43.487 12311143 469.7093
# Counting missing values, if any: true NAs plus any -99 sentinel codes.
# The previous form, sum(x[x == -99]), added up the matching values rather than
# counting them, and would have returned NA as soon as a real NA was present.
# %in% never returns NA, so the logical mask (and the count) is always defined.
sum(is.na(mygap$gdpPercap) | mygap$gdpPercap %in% -99)
## [1] 0
sum(is.na(mygap$pop) | mygap$pop %in% -99)
## [1] 0
sum(is.na(mygap$lifeExp) | mygap$lifeExp %in% -99)
## [1] 0
# Display summary statistics for every column of the dataset
summary(mygap)
## country continent year lifeExp
## Czech Republic: 58 Africa : 637 Min. :1950 Min. :23.60
## Denmark : 58 Americas: 470 1st Qu.:1967 1st Qu.:58.33
## Finland : 58 Asia : 578 Median :1982 Median :69.61
## Iceland : 58 Europe :1302 Mean :1980 Mean :65.24
## Japan : 58 FSU : 139 3rd Qu.:1996 3rd Qu.:73.66
## Netherlands : 58 Oceania : 187 Max. :2007 Max. :82.67
## (Other) :2965
## pop gdpPercap
## Min. :5.941e+04 Min. : 241.2
## 1st Qu.:2.680e+06 1st Qu.: 2505.3
## Median :7.560e+06 Median : 7825.8
## Mean :3.177e+07 Mean : 11313.8
## 3rd Qu.:1.961e+07 3rd Qu.: 17355.8
## Max. :1.319e+09 Max. :113523.1
##
# All 2007 observations, sorted from highest to lowest GDP per capita
Result1 <- mygap %>%
  filter(year == 2007) %>%
  arrange(desc(gdpPercap))

# One point per country showing its 2007 GDP per capita
ggplot(data = Result1, mapping = aes(x = country, y = gdpPercap)) +
  geom_point(color = "green") +
  labs(
    title = "Distribution of GDP per capita across all countries | 2007",
    x = "Country",
    y = "GDP per capita"
  )
Result1
## # A tibble: 183 × 6
## country continent year lifeExp pop gdpPercap
## <fctr> <fctr> <int> <dbl> <int> <dbl>
## 1 Qatar Asia 2007 75.588 907229 82010.98
## 2 Macao, China Asia 2007 80.718 456989 54589.82
## 3 Norway Europe 2007 80.196 4627926 49357.19
## 4 Brunei Asia 2007 77.118 386511 48014.59
## 5 Kuwait Asia 2007 77.588 2505559 47306.99
## 6 Singapore Asia 2007 79.972 4553009 47143.18
## 7 United States Americas 2007 78.242 301139947 42951.65
## 8 Ireland Europe 2007 78.885 4109086 40676.00
## 9 Hong Kong, China Asia 2007 82.208 6980412 39724.98
## 10 Switzerland Europe 2007 81.701 7554661 37506.42
## # ... with 173 more rows
# Box plot of 2007 GDP per capita by continent, with continents ordered by
# their median GDP per capita. The x axis carries the continent and the y axis
# the GDP per capita, so the axis labels are assigned accordingly (they were
# previously swapped).
ggplot(data = Result1) +
  geom_boxplot(
    mapping = aes(
      x = reorder(continent, gdpPercap, FUN = median),
      y = gdpPercap,
      color = continent
    )
  ) +
  ggtitle("Difference in distributions across the different continents | 2007") +
  xlab("Continent") +
  ylab("GDP per capita")
# Ten countries with the highest GDP per capita in 2007
Result3 <- mygap %>%
  filter(year == 2007) %>%
  select(country, gdpPercap, year) %>%
  arrange(desc(gdpPercap)) %>%
  head(10)

# Bars ordered from the highest to the lowest GDP per capita
ggplot(
  data = Result3,
  mapping = aes(
    x = reorder(factor(country), desc(gdpPercap)),
    y = gdpPercap,
    fill = country
  )
) +
  geom_bar(stat = "identity") +
  labs(
    title = "Top 10 countries with the largest GDP per capita | 2007",
    x = "Countries",
    y = "GDP per Capita"
  )
Result3
## # A tibble: 10 × 3
## country gdpPercap year
## <fctr> <dbl> <int>
## 1 Qatar 82010.98 2007
## 2 Macao, China 54589.82 2007
## 3 Norway 49357.19 2007
## 4 Brunei 48014.59 2007
## 5 Kuwait 47306.99 2007
## 6 Singapore 47143.18 2007
## 7 United States 42951.65 2007
## 8 Ireland 40676.00 2007
## 9 Hong Kong, China 39724.98 2007
## 10 Switzerland 37506.42 2007
# GDP per capita for India, one row per recorded year
Result4 <- mygap %>%
  filter(country == "India") %>%
  select(gdpPercap, year)

# One bar per year; fill follows the year, bar outlines are drawn in blue
ggplot(data = Result4, mapping = aes(x = year, y = gdpPercap, fill = year)) +
  geom_bar(stat = "identity", color = "blue") +
  labs(
    title = "GDP per capita for India for all years ",
    x = "Year",
    y = "GDP per capita"
  )
Result4
## # A tibble: 12 × 2
## gdpPercap year
## <dbl> <int>
## 1 546.5657 1952
## 2 590.0620 1957
## 3 658.3472 1962
## 4 700.7706 1967
## 5 724.0325 1972
## 6 813.3373 1977
## 7 855.7235 1982
## 8 976.5127 1987
## 9 1164.4068 1992
## 10 1458.8174 1997
## 11 1746.7695 2002
## 12 2452.2104 2007
# India's GDP per capita in 2002 and in 2007, each kept as a one-cell table
IGDP2002 <- mygap %>%
  filter(country == "India", year == 2002) %>%
  select(gdpPercap)
IGDP2007 <- mygap %>%
  filter(country == "India", year == 2007) %>%
  select(gdpPercap)

# Percentage change in GDP per capita from 2002 to 2007
Result5 <- (IGDP2007 - IGDP2002) * 100 / IGDP2002
Result5
## gdpPercap
## 1 40.38546
# India's rows with an extra column: percent change in GDP per capita relative
# to the previous row (the first row has no predecessor, so it is NA)
Result6 <- mygap %>%
  group_by(country) %>%
  filter(country == "India") %>%
  mutate(
    percent = 100 * ((gdpPercap - lag(gdpPercap)) / lag(gdpPercap))
  )

# Smoothed trend in red with the individual observations overlaid in yellow
ggplot(data = Result6, mapping = aes(x = year, y = gdpPercap)) +
  geom_smooth(color = "red") +
  geom_point(color = "yellow") +
  labs(
    title = "Historical growth (or decline) in GDP per capita for India",
    x = "year",
    y = "GDP per capita"
  )
Result6
## Source: local data frame [12 x 7]
## Groups: country [1]
##
## country continent year lifeExp pop gdpPercap percent
## <fctr> <fctr> <int> <dbl> <int> <dbl> <dbl>
## 1 India Asia 1952 37.373 372000000 546.5657 NA
## 2 India Asia 1957 40.249 409000000 590.0620 7.958100
## 3 India Asia 1962 43.605 454000000 658.3472 11.572539
## 4 India Asia 1967 47.193 506000000 700.7706 6.443935
## 5 India Asia 1972 50.651 567000000 724.0325 3.319477
## 6 India Asia 1977 54.208 634000000 813.3373 12.334362
## 7 India Asia 1982 56.596 708000000 855.7235 5.211394
## 8 India Asia 1987 58.553 788000000 976.5127 14.115439
## 9 India Asia 1992 60.223 872000000 1164.4068 19.241341
## 10 India Asia 1997 61.765 959000000 1458.8174 25.284173
## 11 India Asia 2002 62.879 1034172547 1746.7695 19.738728
## 12 India Asia 2007 64.698 1110396331 2452.2104 40.385464