Use the gapminder package and data set to answer the following
questions. Please write your commands under each question.
1. Get the data for 2002. Assign a name to that data.
# Keep only the rows of gapminder recorded for the year 2002.
data_2002 <- filter(gapminder, year == 2002)
data_2002
## # A tibble: 142 × 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Afghanistan Asia 2002 42.1 25268405 727.
## 2 Albania Europe 2002 75.7 3508512 4604.
## 3 Algeria Africa 2002 71.0 31287142 5288.
## 4 Angola Africa 2002 41.0 10866106 2773.
## 5 Argentina Americas 2002 74.3 38331121 8798.
## 6 Australia Oceania 2002 80.4 19546792 30688.
## 7 Austria Europe 2002 79.0 8148312 32418.
## 8 Bahrain Asia 2002 74.8 656397 23404.
## 9 Bangladesh Asia 2002 62.0 135656790 1136.
## 10 Belgium Europe 2002 78.3 10311970 30486.
## # ℹ 132 more rows
2. Get the data for Germany in 2002.
# Single-country slice: the Germany row(s) for the year 2002.
germany_2002 <- gapminder %>%
  filter(country == "Germany" & year == 2002)
germany_2002
## # A tibble: 1 × 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Germany Europe 2002 78.7 82350671 30036.
3. Find which country has the lowest lifeExp overall.
# Narrow to the two columns of interest first, then keep the row(s) whose
# lifeExp equals the minimum over every country and year.
lowest_lifeExp_overall <- gapminder %>%
  select(country, lifeExp) %>%
  filter(lifeExp == min(lifeExp))
lowest_lifeExp_overall
## # A tibble: 1 × 2
## country lifeExp
## <fct> <dbl>
## 1 Rwanda 23.6
4. Find which country has the lowest lifeExp in 2002.
# Restrict to 2002 before taking the minimum, so min() is computed over
# that year's rows only.
lowest_lifeExp_2002 <- gapminder %>%
  filter(year == 2002) %>%
  select(country, lifeExp) %>%
  filter(lifeExp == min(lifeExp))
lowest_lifeExp_2002
## # A tibble: 1 × 2
## country lifeExp
## <fct> <dbl>
## 1 Zambia 39.2
5. Find the lifeExp in Germany in 2002.
# Life expectancy for Germany in 2002, reported alongside the country name.
lifeExp_germany_2002 <- gapminder %>%
  filter(country == "Germany" & year == 2002) %>%
  select(country, lifeExp)
lifeExp_germany_2002
## # A tibble: 1 × 2
## country lifeExp
## <fct> <dbl>
## 1 Germany 78.7
6. Find the countries whose lifeExp is higher than 80 in 2002.
# Countries whose 2002 life expectancy is strictly above 80.
high_lifeExp_countries_2002 <- gapminder %>%
  filter(year == 2002) %>%
  filter(lifeExp > 80) %>%
  select(country, lifeExp)
high_lifeExp_countries_2002
## # A tibble: 7 × 2
## country lifeExp
## <fct> <dbl>
## 1 Australia 80.4
## 2 Hong Kong, China 81.5
## 3 Iceland 80.5
## 4 Italy 80.2
## 5 Japan 82
## 6 Sweden 80.0
## 7 Switzerland 80.6
7. Find the countries whose lifeExp is more than 70 and less than
80
# Country/year observations with life expectancy strictly between 70 and 80.
# No year filter is applied, so a country appears once per qualifying year.
mid_lifeExp_countries <- gapminder %>%
  select(country, lifeExp) %>%
  filter(lifeExp > 70, lifeExp < 80)
mid_lifeExp_countries
## # A tibble: 471 × 2
## country lifeExp
## <fct> <dbl>
## 1 Albania 70.4
## 2 Albania 72
## 3 Albania 71.6
## 4 Albania 73.0
## 5 Albania 75.7
## 6 Albania 76.4
## 7 Algeria 71.0
## 8 Algeria 72.3
## 9 Argentina 70.8
## 10 Argentina 71.9
## # ℹ 461 more rows
8. Find the lifeExp in Europe across the years. Which year has the
highest lifeExp in Europe?
# Average life expectancy of the European countries, one row per year.
europe_lifeExp <- gapminder %>%
  filter(continent == "Europe") %>%
  group_by(year) %>%
  summarise(avg_lifeExp = mean(lifeExp))

# Year(s) in which that European average reaches its maximum.
highest_lifeExp_year <- filter(europe_lifeExp, avg_lifeExp == max(avg_lifeExp))
highest_lifeExp_year
## # A tibble: 1 × 2
## year avg_lifeExp
## <int> <dbl>
## 1 2007 77.6
9. Define gdp as gdpPercap * pop / 10000. Find the
gdp of Europe in 2002.
# Add the gdp column (gdpPercap * pop / 10000) to data_2002 itself.
data_2002 <- mutate(data_2002, gdp = gdpPercap * pop / 10000)

# Total gdp over the European rows of data_2002.
europe_gdp_2002 <- data_2002 %>%
  filter(continent == "Europe") %>%
  summarise(total_gdp = sum(gdp))
europe_gdp_2002
## # A tibble: 1 × 1
## total_gdp
## <dbl>
## 1 1309346445.
10. Which country has the highest gdp in Europe in 2002 ?
# Among the European rows of data_2002, keep the country with the largest gdp.
# Requires the gdp column to be present on data_2002.
highest_gdp_country <- data_2002 %>%
  filter(continent == "Europe") %>%
  select(country, gdp) %>%
  filter(gdp == max(gdp))
highest_gdp_country
## # A tibble: 1 × 2
## country gdp
## <fct> <dbl>
## 1 Germany 247346845.
11. Save the data in 2002 in Europe. Call it data_2002.
# Rebuild data_2002 from gapminder as the Europe-only rows for 2002.
# This overwrites any earlier data_2002 object.
data_2002 <- gapminder %>%
  filter(continent == "Europe" & year == 2002)
data_2002
## # A tibble: 30 × 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Albania Europe 2002 75.7 3508512 4604.
## 2 Austria Europe 2002 79.0 8148312 32418.
## 3 Belgium Europe 2002 78.3 10311970 30486.
## 4 Bosnia and Herzegovina Europe 2002 74.1 4165416 6019.
## 5 Bulgaria Europe 2002 72.1 7661799 7697.
## 6 Croatia Europe 2002 74.9 4481020 11628.
## 7 Czech Republic Europe 2002 75.5 10256295 17596.
## 8 Denmark Europe 2002 77.2 5374693 32167.
## 9 Finland Europe 2002 78.4 5193039 28205.
## 10 France Europe 2002 79.6 59925035 28926.
## # ℹ 20 more rows
12. Use data_2002. Use ggplot. Plot gdpPercap vs lifeExp.
# Scatter plot of life expectancy (y) against GDP per capita (x).
data_2002 %>%
  ggplot(mapping = aes(x = gdpPercap, y = lifeExp)) +
  geom_point() +
  labs(
    title = "GDP per Capita vs Life Expectancy in 2002",
    x = "GDP per Capita",
    y = "Life Expectancy"
  )

13. Use data_2002. Use ggplot. Plot gdpPercap vs lifeExp by
continent (color)
# Same scatter plot, with point colour mapped to continent.
data_2002 %>%
  ggplot(mapping = aes(x = gdpPercap, y = lifeExp, color = continent)) +
  geom_point() +
  labs(
    title = "GDP per Capita vs Life Expectancy in 2002 by Continent",
    x = "GDP per Capita",
    y = "Life Expectancy"
  )

14. Use data_2002. Use ggplot. Plot gdpPercap vs lifeExp by
continent and pop (color and size)
# Scatter plot with colour mapped to continent and point size to population.
data_2002 %>%
  ggplot(
    mapping = aes(x = gdpPercap, y = lifeExp, color = continent, size = pop)
  ) +
  geom_point() +
  labs(
    title = "GDP per Capita vs Life Expectancy in 2002 by Continent and Population",
    x = "GDP per Capita",
    y = "Life Expectancy"
  )

15. Get data for Europe in 2002. Call it data_Europe
# European rows of gapminder for the year 2002.
data_Europe <- gapminder %>%
  filter(continent == "Europe" & year == 2002)
data_Europe
## # A tibble: 30 × 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Albania Europe 2002 75.7 3508512 4604.
## 2 Austria Europe 2002 79.0 8148312 32418.
## 3 Belgium Europe 2002 78.3 10311970 30486.
## 4 Bosnia and Herzegovina Europe 2002 74.1 4165416 6019.
## 5 Bulgaria Europe 2002 72.1 7661799 7697.
## 6 Croatia Europe 2002 74.9 4481020 11628.
## 7 Czech Republic Europe 2002 75.5 10256295 17596.
## 8 Denmark Europe 2002 77.2 5374693 32167.
## 9 Finland Europe 2002 78.4 5193039 28205.
## 10 France Europe 2002 79.6 59925035 28926.
## # ℹ 20 more rows
16. Use data_Europe. Use ggplot. Plot pop vs gdpPercap.
# GDP per capita (y) against raw population (x) for the European 2002 data.
data_Europe %>%
  ggplot(mapping = aes(x = pop, y = gdpPercap)) +
  geom_point() +
  labs(
    title = "Population vs GDP per Capita in Europe (2002)",
    x = "Population",
    y = "GDP per Capita"
  )

17. Use data_Europe. Use ggplot. Plot pop vs gdpPercap. Scale
population by log10
# Population is transformed with log10() inside the aesthetic mapping, so the
# x axis shows log10(pop) values rather than raw head counts.
data_Europe %>%
  ggplot(mapping = aes(x = log10(pop), y = gdpPercap)) +
  geom_point() +
  labs(
    title = "Log(Population) vs GDP per Capita in Europe (2002)",
    x = "Log(Population)",
    y = "GDP per Capita"
  )

18. Use data_Europe. Use ggplot. Plot pop vs gdpPercap. Scale
population by log10. Color the data by country.
# log10(pop) on the x axis, with one point colour per country.
data_Europe %>%
  ggplot(mapping = aes(x = log10(pop), y = gdpPercap, color = country)) +
  geom_point() +
  labs(
    title = "Log(Population) vs GDP per Capita in Europe (2002) by Country",
    x = "Log(Population)",
    y = "GDP per Capita"
  )

19. Use data_Europe. Use ggplot. Plot pop vs gdpPercap. Scale
population by log10. Color the data by country and size it by
lifeExp.
# log10(pop) on the x axis; colour encodes country, point size encodes lifeExp.
data_Europe %>%
  ggplot(
    mapping = aes(
      x = log10(pop),
      y = gdpPercap,
      color = country,
      size = lifeExp
    )
  ) +
  geom_point() +
  labs(
    title = "Log(Population) vs GDP per Capita in Europe (2002) by Country and Life Expectancy",
    x = "Log(Population)",
    y = "GDP per Capita"
  )

20. See the attached Excel file, namely tourism.xls. Create a
folder and name it FORECASTING.
1) Save the tourism excel file in that FORECASTING directory.
2) Set your working directory as FORECASTING
3) Import tourism excel file into R-studio.
4) Assign a different name to this data, such as “mydata”
5) Check the structure of your dataset by str() function. Change
Region column from character to factor. Use as.factor() function.
# Session-wide settings: print 3 significant digits, effectively disable
# scientific notation, and keep character data as character.
# The option name must be spelled `stringsAsFactors`; a misspelled name is
# accepted silently by options() and simply stored as an unused option.
options(digits = 3, scipen = 9999, stringsAsFactors = FALSE)
# make sure characters are not factors. The 1st column, Quarter, needs to be NOT factor.
library(readxl)
## Warning: package 'readxl' was built under R version 4.3.2
# Point the session at the FORECASTING folder, load the tourism workbook into
# `mydata`, and inspect the column types it was imported with.
setwd(dir = "C://FORECASTING")
mydata <- read_excel(path = "C://FORECASTING//tourism.xlsx")
str(object = mydata)
## tibble [24,320 × 5] (S3: tbl_df/tbl/data.frame)
## $ Quarter: chr [1:24320] "1998-01-01" "1998-04-01" "1998-07-01" "1998-10-01" ...
## $ Region : chr [1:24320] "Adelaide" "Adelaide" "Adelaide" "Adelaide" ...
## $ State : chr [1:24320] "South Australia" "South Australia" "South Australia" "South Australia" ...
## $ Purpose: chr [1:24320] "Business" "Business" "Business" "Business" ...
## $ Trips : num [1:24320] 135 110 166 127 137 ...
# Convert the Region column to a factor in place, then re-check the structure
# to confirm the new column type.
mydata[["Region"]] <- as.factor(mydata[["Region"]])
str(mydata)
## tibble [24,320 × 5] (S3: tbl_df/tbl/data.frame)
## $ Quarter: chr [1:24320] "1998-01-01" "1998-04-01" "1998-07-01" "1998-10-01" ...
## $ Region : Factor w/ 76 levels "Adelaide","Adelaide Hills",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ State : chr [1:24320] "South Australia" "South Australia" "South Australia" "South Australia" ...
## $ Purpose: chr [1:24320] "Business" "Business" "Business" "Business" ...
## $ Trips : num [1:24320] 135 110 166 127 137 ...