Loading library

library(readr)
library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(corrplot)
## corrplot 0.84 loaded

Dataset 01

This dataset, retrieved from the Bureau of Labor Statistics, shows the median weekly incomes for 535 different occupations. The data encompasses information for all working American citizens as of January 2015. The incomes are broken into male and female statistics, preceded by the total median income when including both genders. ####Importing data

df = read_csv("https://raw.githubusercontent.com/ahmshahparan/DATA607_PROJECT02/master/inc_occ_gender.csv")
## Parsed with column specification:
## cols(
##   Occupation = col_character(),
##   All_workers = col_integer(),
##   All_weekly = col_character(),
##   M_workers = col_integer(),
##   M_weekly = col_character(),
##   F_workers = col_integer(),
##   F_weekly = col_character()
## )
df
## # A tibble: 558 x 7
##    Occupation All_workers All_weekly M_workers M_weekly F_workers F_weekly
##    <chr>            <int> <chr>          <int> <chr>        <int> <chr>   
##  1 ALL OCCUP…      109080 809            60746 895          48334 726     
##  2 MANAGEMENT       12480 1351            7332 1486          5147 1139    
##  3 Chief exe…        1046 2041             763 2251           283 1836    
##  4 General a…         823 1260             621 1347           202 1002    
##  5 Legislato…           8 Na                 5 Na               4 Na      
##  6 Advertisi…          55 1050              29 Na              26 Na      
##  7 Marketing…         948 1462             570 1603           378 1258    
##  8 Public re…          59 1557              24 Na              35 Na      
##  9 Administr…         170 1191              96 1451            73 981     
## 10 Computer …         636 1728             466 1817           169 1563    
## # ... with 548 more rows

Data definition

Median weekly earnings of full-time wage and salary workers by detailed occupation and sex.

Occupation: Job title as given from BLS. Industry summaries are given in ALL CAPS.
All_workers: Number of workers male and female, in thousands.
All_weekly: Median weekly income including male and female workers, in USD.
M_workers: Number of male workers, in thousands.
M_weekly: Median weekly income for male workers, in USD.
F_workers: Number of female workers, in thousands.
F_weekly: Median weekly income for female workers, in USD.

Collapsing columns into key-value pair. Collapsing based on gender division as gender, workers and their weekly median income as class.

df %>% 
    gather(key, value, 2:7) %>% 
    separate(key, into=c("gender", "class"), sep="_") -> dt

Transforming data and delation of missing value.

dt %>% mutate(value=as.numeric(value)) %>% na.omit() -> dt
## Warning in evalq(as.numeric(value), <environment>): NAs introduced by
## coercion
dt
## # A tibble: 2,420 x 4
##    Occupation                                gender class       value
##    <chr>                                     <chr>  <chr>       <dbl>
##  1 ALL OCCUPATIONS                           All    workers 109080   
##  2 MANAGEMENT                                All    workers  12480   
##  3 Chief executives                          All    workers   1046   
##  4 General and operations managers           All    workers    823   
##  5 Legislators                               All    workers      8.00
##  6 Advertising and promotions managers       All    workers     55.0 
##  7 Marketing and sales managers              All    workers    948   
##  8 Public relations and fundraising managers All    workers     59.0 
##  9 Administrative services managers          All    workers    170   
## 10 Computer and information systems managers All    workers    636   
## # ... with 2,410 more rows

Weekly income histogram for male, female and together.

dt %>% 
    filter(class=='weekly') %>%
    ggplot(aes(x=value)) + 
    geom_histogram(bins=50) + 
    facet_grid(gender ~ .)

Median income for male, female and together from tidy data dt.

dt %>% 
    filter(class == 'weekly') %>% 
    group_by(gender) %>% 
    summarize(median(value))
## # A tibble: 3 x 2
##   gender `median(value)`
##   <chr>            <dbl>
## 1 All                856
## 2 F                  736
## 3 M                  916

Verifying the median calculation from initial data frame df.

#Median income for male
median(na.omit(as.numeric(df$M_weekly)))
## Warning in na.omit(as.numeric(df$M_weekly)): NAs introduced by coercion
## [1] 915.5
#Median income for female
median(na.omit(as.numeric(df$F_weekly)))
## Warning in na.omit(as.numeric(df$F_weekly)): NAs introduced by coercion
## [1] 736

Dataset 02

This data is from Environmental Protection Administration, Executive Yuan, R.O.C. (Taiwan). There is air quality data and meteorological monitoring data for research and analysis (only include northern Taiwan 2015).

Importing data

df <- read.csv("https://raw.githubusercontent.com/ahmshahparan/DATA607_PROJECT02/master/2015y_Banqiao_Station.csv", stringsAsFactors = FALSE)
str(df)
## 'data.frame':    7665 obs. of  27 variables:
##  $ date   : chr  "2015/01/01" "2015/01/01" "2015/01/01" "2015/01/01" ...
##  $ station: chr  "Banqiao" "Banqiao" "Banqiao" "Banqiao" ...
##  $ item   : chr  "AMB_TEMP" "CH4" "CO" "NMHC" ...
##  $ X00    : chr  "16" "2.1" "0.79" "0.14" ...
##  $ X01    : chr  "16" "2.1" "0.8" "0.15" ...
##  $ X02    : chr  "16" "2.1" "0.71" "0.13" ...
##  $ X03    : chr  "15" "2" "0.66" "0.12" ...
##  $ X04    : chr  "15" "2" "0.53" "0.11" ...
##  $ X05    : chr  "14" "2" "0.5" "0.11" ...
##  $ X06    : chr  "14" "2" "0.57" "0.14" ...
##  $ X07    : chr  "14" "2" "0.61" "0.14" ...
##  $ X08    : chr  "14" "2" "0.59" "0.13" ...
##  $ X09    : chr  "14" "2" "0.6" "0.14" ...
##  $ X10    : chr  "15" "2" "0.55" "0.13" ...
##  $ X11    : chr  "14" "2" "0.51" "0.11" ...
##  $ X12    : chr  "15" "2" "0.47" "0.11" ...
##  $ X13    : chr  "14" "2" "0.42" "0.09" ...
##  $ X14    : chr  "14" "2" "0.41" "0.1" ...
##  $ X15    : chr  "13" "2" "0.42" "0.1" ...
##  $ X16    : chr  "13" "2" "0.45" "0.11" ...
##  $ X17    : chr  "13" "2" "0.46" "0.12" ...
##  $ X18    : chr  "13" "2" "0.45" "0.13" ...
##  $ X19    : chr  "13" "2" "0.48" "0.15" ...
##  $ X20    : chr  "13" "2" "0.52" "0.16" ...
##  $ X21    : chr  "13" "2" "0.43" "0.13" ...
##  $ X22    : chr  "13" "2" "0.4" "0.12" ...
##  $ X23    : chr  "13" "2" "0.35" "0.11" ...

Data definition

Data definition can be found from here.

Data preparation

I will do some data formatting. Date is currently in character, but I want to have it as type “Date”. All values of observations - columns 4 to 27 - should be numeric.

df$date <- as.Date(df$date, format = "%Y/%m/%d")
df$station <- as.factor(df$station)
df$item <- as.factor(df$item)

for(i in 4:27){
        df[,i] <- suppressWarnings(as.numeric(df[,i]))
}
summary(df[,4:27])
##       X00              X01              X02              X03        
##  Min.   : -0.30   Min.   : -0.30   Min.   : -0.40   Min.   : -0.30  
##  1st Qu.:  1.30   1st Qu.:  1.20   1st Qu.:  1.20   1st Qu.:  1.10  
##  Median :  5.20   Median :  5.05   Median :  4.80   Median :  4.70  
##  Mean   : 28.89   Mean   : 29.16   Mean   : 28.89   Mean   : 29.15  
##  3rd Qu.: 32.00   3rd Qu.: 31.00   3rd Qu.: 31.00   3rd Qu.: 30.00  
##  Max.   :360.00   Max.   :359.00   Max.   :359.00   Max.   :354.00  
##  NA's   :1065     NA's   :1055     NA's   :1081     NA's   :1048    
##       X04              X05              X06              X07        
##  Min.   : -0.20   Min.   : -0.10   Min.   : -0.20   Min.   :  0.00  
##  1st Qu.:  1.11   1st Qu.:  1.20   1st Qu.:  1.40   1st Qu.:  1.90  
##  Median :  4.90   Median :  5.10   Median :  6.40   Median :  8.50  
##  Mean   : 29.53   Mean   : 29.15   Mean   : 29.73   Mean   : 30.64  
##  3rd Qu.: 29.00   3rd Qu.: 29.00   3rd Qu.: 30.00   3rd Qu.: 32.00  
##  Max.   :357.00   Max.   :358.00   Max.   :386.00   Max.   :355.00  
##  NA's   :1050     NA's   :1040     NA's   :1036     NA's   :1032    
##       X08              X09              X10              X11        
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.00   Min.   : -0.10  
##  1st Qu.:  2.00   1st Qu.:  2.00   1st Qu.:  2.00   1st Qu.:  2.00  
##  Median :  9.40   Median :  9.40   Median :  9.00   Median :  9.30  
##  Mean   : 31.18   Mean   : 31.87   Mean   : 33.14   Mean   : 33.89  
##  3rd Qu.: 34.50   3rd Qu.: 36.00   3rd Qu.: 36.00   3rd Qu.: 36.00  
##  Max.   :359.00   Max.   :360.00   Max.   :359.00   Max.   :360.00  
##  NA's   :1050     NA's   :1083     NA's   :1267     NA's   :1284    
##       X12              X13              X14              X15        
##  Min.   : -0.20   Min.   : -0.10   Min.   : -0.40   Min.   : -0.20  
##  1st Qu.:  2.00   1st Qu.:  2.00   1st Qu.:  2.00   1st Qu.:  1.90  
##  Median :  8.30   Median :  7.30   Median :  6.60   Median :  7.00  
##  Mean   : 33.05   Mean   : 32.78   Mean   : 32.32   Mean   : 31.62  
##  3rd Qu.: 35.00   3rd Qu.: 35.00   3rd Qu.: 34.00   3rd Qu.: 34.00  
##  Max.   :359.00   Max.   :359.00   Max.   :360.00   Max.   :360.00  
##  NA's   :1183     NA's   :1228     NA's   :1246     NA's   :1167    
##       X16              X17              X18              X19        
##  Min.   :  0.00   Min.   : -0.20   Min.   : -0.10   Min.   :  0.00  
##  1st Qu.:  1.90   1st Qu.:  1.90   1st Qu.:  1.90   1st Qu.:  1.90  
##  Median :  7.20   Median :  7.65   Median :  7.10   Median :  7.00  
##  Mean   : 30.29   Mean   : 29.90   Mean   : 29.32   Mean   : 28.92  
##  3rd Qu.: 34.00   3rd Qu.: 34.75   3rd Qu.: 35.00   3rd Qu.: 34.00  
##  Max.   :360.00   Max.   :359.00   Max.   :359.00   Max.   :356.00  
##  NA's   :1088     NA's   :1055     NA's   :1059     NA's   :1056    
##       X20              X21              X22              X23        
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.00   Min.   : -0.20  
##  1st Qu.:  1.80   1st Qu.:  1.70   1st Qu.:  1.60   1st Qu.:  1.44  
##  Median :  6.80   Median :  6.00   Median :  6.00   Median :  5.60  
##  Mean   : 29.09   Mean   : 28.87   Mean   : 29.06   Mean   : 28.60  
##  3rd Qu.: 34.00   3rd Qu.: 33.00   3rd Qu.: 32.00   3rd Qu.: 32.00  
##  Max.   :358.00   Max.   :360.00   Max.   :358.00   Max.   :358.00  
##  NA's   :1050     NA's   :1028     NA's   :1056     NA's   :1048
str(df)
## 'data.frame':    7665 obs. of  27 variables:
##  $ date   : Date, format: "2015-01-01" "2015-01-01" ...
##  $ station: Factor w/ 1 level "Banqiao": 1 1 1 1 1 1 1 1 1 1 ...
##  $ item   : Factor w/ 21 levels "AMB_TEMP","CH4",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ X00    : num  16 2.1 0.79 0.14 1.2 16 17 37 NA 177 ...
##  $ X01    : num  16 2.1 0.8 0.15 1.3 16 17 36 NA 178 ...
##  $ X02    : num  16 2.1 0.71 0.13 1 13 14 38 NA 163 ...
##  $ X03    : num  15 2 0.66 0.12 0.8 11 12 39 NA 147 ...
##  $ X04    : num  15 2 0.53 0.11 0.6 10 11 38 NA 131 ...
##  $ X05    : num  14 2 0.5 0.11 0.8 11 12 37 NA 112 ...
##  $ X06    : num  14 2 0.57 0.14 1.4 18 19 29 NA 103 ...
##  $ X07    : num  14 2 0.61 0.14 2.3 17 20 27 NA 104 ...
##  $ X08    : num  14 2 0.59 0.13 2.9 15 18 29 NA 111 ...
##  $ X09    : num  14 2 0.6 0.14 4.1 16 20 29 NA 111 ...
##  $ X10    : num  15 2 0.55 0.13 4.2 14 18 35 NA 100 ...
##  $ X11    : num  14 2 0.51 0.11 3.8 13 17 39 NA 92 ...
##  $ X12    : num  15 2 0.47 0.11 4.2 12 16 41 NA 83 ...
##  $ X13    : num  14 2 0.42 0.09 2.9 10 13 43 NA 76 ...
##  $ X14    : num  14 2 0.41 0.1 3 12 15 39 NA 73 ...
##  $ X15    : num  13 2 0.42 0.1 2.4 13 16 36 NA 66 ...
##  $ X16    : num  13 2 0.45 0.11 1.8 14 16 33 NA 61 ...
##  $ X17    : num  13 2 0.46 0.12 2 16 18 31 NA 57 ...
##  $ X18    : num  13 2 0.45 0.13 2.5 18 20 29 NA 50 ...
##  $ X19    : num  13 2 0.48 0.15 2.8 21 24 24 NA 45 ...
##  $ X20    : num  13 2 0.52 0.16 3.3 23 27 20 NA 45 ...
##  $ X21    : num  13 2 0.43 0.13 3.2 19 23 22 NA 43 ...
##  $ X22    : num  13 2 0.4 0.12 2.7 17 20 23 NA 40 ...
##  $ X23    : num  13 2 0.35 0.11 1.6 14 15 26 NA 42 ...

Tidy data

The Hours are currently in columns, but these are observations (–> Rows!).
The column “item” contains the variables per row –> Columns

I want to have the hours in rows. So I need a new column for ‘Hour’ (-> key = “Hour”) and the corresponding ‘Value’, currently spread of the column 4:27.

gatherDf <- gather(df, key = "Hour", value = "Value", 4:27 )
str(gatherDf)
## 'data.frame':    183960 obs. of  5 variables:
##  $ date   : Date, format: "2015-01-01" "2015-01-01" ...
##  $ station: Factor w/ 1 level "Banqiao": 1 1 1 1 1 1 1 1 1 1 ...
##  $ item   : Factor w/ 21 levels "AMB_TEMP","CH4",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ Hour   : chr  "X00" "X00" "X00" "X00" ...
##  $ Value  : num  16 2.1 0.79 0.14 1.2 16 17 37 NA 177 ...

But if we want to have the indicators spread over columns - no problem - just use spread()-function from tidyR.

spreadDf <- spread(gatherDf, key = item, value = Value) 

Let’s check the result for the dataset.

str(spreadDf)
## 'data.frame':    8760 obs. of  24 variables:
##  $ date      : Date, format: "2015-01-01" "2015-01-01" ...
##  $ station   : Factor w/ 1 level "Banqiao": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Hour      : chr  "X00" "X01" "X02" "X03" ...
##  $ AMB_TEMP  : num  16 16 16 15 15 14 14 14 14 14 ...
##  $ CH4       : num  2.1 2.1 2.1 2 2 2 2 2 2 2 ...
##  $ CO        : num  0.79 0.8 0.71 0.66 0.53 0.5 0.57 0.61 0.59 0.6 ...
##  $ NMHC      : num  0.14 0.15 0.13 0.12 0.11 0.11 0.14 0.14 0.13 0.14 ...
##  $ NO        : num  1.2 1.3 1 0.8 0.6 0.8 1.4 2.3 2.9 4.1 ...
##  $ NO2       : num  16 16 13 11 10 11 18 17 15 16 ...
##  $ NOx       : num  17 17 14 12 11 12 19 20 18 20 ...
##  $ O3        : num  37 36 38 39 38 37 29 27 29 29 ...
##  $ PH_RAIN   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ PM10      : num  177 178 163 147 131 112 103 104 111 111 ...
##  $ PM2.5     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ RAIN_COND : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ RAINFALL  : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ RH        : num  57 57 57 58 58 57 57 56 53 50 ...
##  $ SO2       : num  12 11 8 6.5 5.5 5.7 5.8 6.5 6.8 5.9 ...
##  $ THC       : num  2.2 2.2 2.2 2.2 2.1 2.1 2.1 2.2 2.1 2.1 ...
##  $ UVB       : num  0 0 0 0 0 0 0 0.2 0.7 1.6 ...
##  $ WD_HR     : num  69 67 63 63 69 68 74 66 71 73 ...
##  $ WIND_DIREC: num  69 65 53 63 67 62 70 72 73 68 ...
##  $ WIND_SPEED: num  4.7 4 3.7 4.1 3 2.9 3.6 3.1 2.8 3.6 ...
##  $ WS_HR     : num  4.2 4 3.5 3.3 3.1 3.3 2.2 3 3.3 3.2 ...

Dataset 03

Prevalence of HIV refers to the percentage of people ages 15-49 who are infected with HIV. Data source can be found from here.

Importing data

library(RCurl)
## Loading required package: bitops
## 
## Attaching package: 'RCurl'
## The following object is masked from 'package:tidyr':
## 
##     complete
library(tibble)
url <- getURL("https://raw.githubusercontent.com/ahmshahparan/DATA607_PROJECT02/master/indicator%20hiv%20estimated%20prevalence%25%2015-49.csv")
df <- read.csv(text = url, header = TRUE, check.names = FALSE)

Tidy data

colnames(df)[colnames(df) == "Estimated HIV Prevalence% - (Ages 15-49)"] <- "country"
hiv <- as.tibble(df) %>%
        gather(`1979`:`2011`, key = year, value = prec, na.rm = TRUE)

Find maximum HIV percentage for each country

hiv %>%
        group_by(country) %>%
        summarise(max_perc = max(prec, na.rm = TRUE))
## # A tibble: 154 x 2
##    country     max_perc
##    <fct>          <dbl>
##  1 Afghanistan   0.0600
##  2 Algeria       0.100 
##  3 Angola        2.10  
##  4 Argentina     0.500 
##  5 Armenia       0.200 
##  6 Australia     0.200 
##  7 Austria       0.400 
##  8 Azerbaijan    0.100 
##  9 Bahamas       3.90  
## 10 Bangladesh    0.0600
## # ... with 144 more rows

Find the country with max HIV prevalence percentage

hiv %>%
        group_by(country) %>%
        summarise(max_perc = max(prec, na.rm = TRUE)) %>%
        arrange(desc(max_perc))
## # A tibble: 154 x 2
##    country      max_perc
##    <fct>           <dbl>
##  1 Zimbabwe         26.5
##  2 Botswana         26.3
##  3 Swaziland        26.0
##  4 Lesotho          24.5
##  5 South Africa     18.1
##  6 Namibia          16.5
##  7 Zambia           15.0
##  8 Malawi           14.7
##  9 Mozambique       11.4
## 10 Uganda           10.7
## # ... with 144 more rows

Zimbabwe has the maximum HIV prevalence percentage followed by Botswana.

Scatter plot for HIV prevalence percentage:

hiv %>%
        ggplot(aes(x = year, y = prec, group = country)) +
        geom_jitter() +
        coord_flip()