library(readr)
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(corrplot)
## corrplot 0.84 loaded
This dataset, retrieved from the Bureau of Labor Statistics, shows the median weekly incomes for 535 different occupations. The data encompasses information for all working American citizens as of January 2015. The incomes are broken into male and female statistics, preceded by the total median income when including both genders. ####Importing data
df = read_csv("https://raw.githubusercontent.com/ahmshahparan/DATA607_PROJECT02/master/inc_occ_gender.csv")
## Parsed with column specification:
## cols(
## Occupation = col_character(),
## All_workers = col_integer(),
## All_weekly = col_character(),
## M_workers = col_integer(),
## M_weekly = col_character(),
## F_workers = col_integer(),
## F_weekly = col_character()
## )
df
## # A tibble: 558 x 7
## Occupation All_workers All_weekly M_workers M_weekly F_workers F_weekly
## <chr> <int> <chr> <int> <chr> <int> <chr>
## 1 ALL OCCUP… 109080 809 60746 895 48334 726
## 2 MANAGEMENT 12480 1351 7332 1486 5147 1139
## 3 Chief exe… 1046 2041 763 2251 283 1836
## 4 General a… 823 1260 621 1347 202 1002
## 5 Legislato… 8 Na 5 Na 4 Na
## 6 Advertisi… 55 1050 29 Na 26 Na
## 7 Marketing… 948 1462 570 1603 378 1258
## 8 Public re… 59 1557 24 Na 35 Na
## 9 Administr… 170 1191 96 1451 73 981
## 10 Computer … 636 1728 466 1817 169 1563
## # ... with 548 more rows
Median weekly earnings of full-time wage and salary workers by detailed occupation and sex.
Occupation: Job title as given from BLS. Industry summaries are given in ALL CAPS. All_workers: Number of workers male and female, in thousands. All_weekly: Median weekly income including male and female workers, in USD. M_workers: Number of male workers, in thousands. M_weekly: Median weekly income for male workers, in USD. F_workers: Number of female workers, in thousands. F_weekly: Median weekly income for female workers, in USD.
df %>%
gather(key, value, 2:7) %>%
separate(key, into=c("gender", "class"), sep="_") -> dt
dt %>% mutate(value=as.numeric(value)) %>% na.omit() -> dt
## Warning in evalq(as.numeric(value), <environment>): NAs introduced by
## coercion
dt
## # A tibble: 2,420 x 4
## Occupation gender class value
## <chr> <chr> <chr> <dbl>
## 1 ALL OCCUPATIONS All workers 109080
## 2 MANAGEMENT All workers 12480
## 3 Chief executives All workers 1046
## 4 General and operations managers All workers 823
## 5 Legislators All workers 8.00
## 6 Advertising and promotions managers All workers 55.0
## 7 Marketing and sales managers All workers 948
## 8 Public relations and fundraising managers All workers 59.0
## 9 Administrative services managers All workers 170
## 10 Computer and information systems managers All workers 636
## # ... with 2,410 more rows
dt %>%
filter(class=='weekly') %>%
ggplot(aes(x=value)) +
geom_histogram(bins=50) +
facet_grid(gender ~ .)
dt %>%
filter(class == 'weekly') %>%
group_by(gender) %>%
summarize(median(value))
## # A tibble: 3 x 2
## gender `median(value)`
## <chr> <dbl>
## 1 All 856
## 2 F 736
## 3 M 916
#Median income for male
median(na.omit(as.numeric(df$M_weekly)))
## Warning in na.omit(as.numeric(df$M_weekly)): NAs introduced by coercion
## [1] 915.5
#Median income for female
median(na.omit(as.numeric(df$F_weekly)))
## Warning in na.omit(as.numeric(df$F_weekly)): NAs introduced by coercion
## [1] 736
This data is from Environmental Protection Administration, Executive Yuan, R.O.C. (Taiwan). There is air quality data and meteorological monitoring data for research and analysis (only include northern Taiwan 2015).
df <- read.csv("https://raw.githubusercontent.com/ahmshahparan/DATA607_PROJECT02/master/2015y_Banqiao_Station.csv", stringsAsFactors = FALSE)
str(df)
## 'data.frame': 7665 obs. of 27 variables:
## $ date : chr "2015/01/01" "2015/01/01" "2015/01/01" "2015/01/01" ...
## $ station: chr "Banqiao" "Banqiao" "Banqiao" "Banqiao" ...
## $ item : chr "AMB_TEMP" "CH4" "CO" "NMHC" ...
## $ X00 : chr "16" "2.1" "0.79" "0.14" ...
## $ X01 : chr "16" "2.1" "0.8" "0.15" ...
## $ X02 : chr "16" "2.1" "0.71" "0.13" ...
## $ X03 : chr "15" "2" "0.66" "0.12" ...
## $ X04 : chr "15" "2" "0.53" "0.11" ...
## $ X05 : chr "14" "2" "0.5" "0.11" ...
## $ X06 : chr "14" "2" "0.57" "0.14" ...
## $ X07 : chr "14" "2" "0.61" "0.14" ...
## $ X08 : chr "14" "2" "0.59" "0.13" ...
## $ X09 : chr "14" "2" "0.6" "0.14" ...
## $ X10 : chr "15" "2" "0.55" "0.13" ...
## $ X11 : chr "14" "2" "0.51" "0.11" ...
## $ X12 : chr "15" "2" "0.47" "0.11" ...
## $ X13 : chr "14" "2" "0.42" "0.09" ...
## $ X14 : chr "14" "2" "0.41" "0.1" ...
## $ X15 : chr "13" "2" "0.42" "0.1" ...
## $ X16 : chr "13" "2" "0.45" "0.11" ...
## $ X17 : chr "13" "2" "0.46" "0.12" ...
## $ X18 : chr "13" "2" "0.45" "0.13" ...
## $ X19 : chr "13" "2" "0.48" "0.15" ...
## $ X20 : chr "13" "2" "0.52" "0.16" ...
## $ X21 : chr "13" "2" "0.43" "0.13" ...
## $ X22 : chr "13" "2" "0.4" "0.12" ...
## $ X23 : chr "13" "2" "0.35" "0.11" ...
Data definition can be found from here.
I will do some data formatting. Date is currently in character, but I want to have it as type “Date”. All values of observations - columns 4 to 27 - should be numeric.
df$date <- as.Date(df$date, format = "%Y/%m/%d")
df$station <- as.factor(df$station)
df$item <- as.factor(df$item)
for(i in 4:27){
df[,i] <- suppressWarnings(as.numeric(df[,i]))
}
summary(df[,4:27])
## X00 X01 X02 X03
## Min. : -0.30 Min. : -0.30 Min. : -0.40 Min. : -0.30
## 1st Qu.: 1.30 1st Qu.: 1.20 1st Qu.: 1.20 1st Qu.: 1.10
## Median : 5.20 Median : 5.05 Median : 4.80 Median : 4.70
## Mean : 28.89 Mean : 29.16 Mean : 28.89 Mean : 29.15
## 3rd Qu.: 32.00 3rd Qu.: 31.00 3rd Qu.: 31.00 3rd Qu.: 30.00
## Max. :360.00 Max. :359.00 Max. :359.00 Max. :354.00
## NA's :1065 NA's :1055 NA's :1081 NA's :1048
## X04 X05 X06 X07
## Min. : -0.20 Min. : -0.10 Min. : -0.20 Min. : 0.00
## 1st Qu.: 1.11 1st Qu.: 1.20 1st Qu.: 1.40 1st Qu.: 1.90
## Median : 4.90 Median : 5.10 Median : 6.40 Median : 8.50
## Mean : 29.53 Mean : 29.15 Mean : 29.73 Mean : 30.64
## 3rd Qu.: 29.00 3rd Qu.: 29.00 3rd Qu.: 30.00 3rd Qu.: 32.00
## Max. :357.00 Max. :358.00 Max. :386.00 Max. :355.00
## NA's :1050 NA's :1040 NA's :1036 NA's :1032
## X08 X09 X10 X11
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : -0.10
## 1st Qu.: 2.00 1st Qu.: 2.00 1st Qu.: 2.00 1st Qu.: 2.00
## Median : 9.40 Median : 9.40 Median : 9.00 Median : 9.30
## Mean : 31.18 Mean : 31.87 Mean : 33.14 Mean : 33.89
## 3rd Qu.: 34.50 3rd Qu.: 36.00 3rd Qu.: 36.00 3rd Qu.: 36.00
## Max. :359.00 Max. :360.00 Max. :359.00 Max. :360.00
## NA's :1050 NA's :1083 NA's :1267 NA's :1284
## X12 X13 X14 X15
## Min. : -0.20 Min. : -0.10 Min. : -0.40 Min. : -0.20
## 1st Qu.: 2.00 1st Qu.: 2.00 1st Qu.: 2.00 1st Qu.: 1.90
## Median : 8.30 Median : 7.30 Median : 6.60 Median : 7.00
## Mean : 33.05 Mean : 32.78 Mean : 32.32 Mean : 31.62
## 3rd Qu.: 35.00 3rd Qu.: 35.00 3rd Qu.: 34.00 3rd Qu.: 34.00
## Max. :359.00 Max. :359.00 Max. :360.00 Max. :360.00
## NA's :1183 NA's :1228 NA's :1246 NA's :1167
## X16 X17 X18 X19
## Min. : 0.00 Min. : -0.20 Min. : -0.10 Min. : 0.00
## 1st Qu.: 1.90 1st Qu.: 1.90 1st Qu.: 1.90 1st Qu.: 1.90
## Median : 7.20 Median : 7.65 Median : 7.10 Median : 7.00
## Mean : 30.29 Mean : 29.90 Mean : 29.32 Mean : 28.92
## 3rd Qu.: 34.00 3rd Qu.: 34.75 3rd Qu.: 35.00 3rd Qu.: 34.00
## Max. :360.00 Max. :359.00 Max. :359.00 Max. :356.00
## NA's :1088 NA's :1055 NA's :1059 NA's :1056
## X20 X21 X22 X23
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : -0.20
## 1st Qu.: 1.80 1st Qu.: 1.70 1st Qu.: 1.60 1st Qu.: 1.44
## Median : 6.80 Median : 6.00 Median : 6.00 Median : 5.60
## Mean : 29.09 Mean : 28.87 Mean : 29.06 Mean : 28.60
## 3rd Qu.: 34.00 3rd Qu.: 33.00 3rd Qu.: 32.00 3rd Qu.: 32.00
## Max. :358.00 Max. :360.00 Max. :358.00 Max. :358.00
## NA's :1050 NA's :1028 NA's :1056 NA's :1048
str(df)
## 'data.frame': 7665 obs. of 27 variables:
## $ date : Date, format: "2015-01-01" "2015-01-01" ...
## $ station: Factor w/ 1 level "Banqiao": 1 1 1 1 1 1 1 1 1 1 ...
## $ item : Factor w/ 21 levels "AMB_TEMP","CH4",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ X00 : num 16 2.1 0.79 0.14 1.2 16 17 37 NA 177 ...
## $ X01 : num 16 2.1 0.8 0.15 1.3 16 17 36 NA 178 ...
## $ X02 : num 16 2.1 0.71 0.13 1 13 14 38 NA 163 ...
## $ X03 : num 15 2 0.66 0.12 0.8 11 12 39 NA 147 ...
## $ X04 : num 15 2 0.53 0.11 0.6 10 11 38 NA 131 ...
## $ X05 : num 14 2 0.5 0.11 0.8 11 12 37 NA 112 ...
## $ X06 : num 14 2 0.57 0.14 1.4 18 19 29 NA 103 ...
## $ X07 : num 14 2 0.61 0.14 2.3 17 20 27 NA 104 ...
## $ X08 : num 14 2 0.59 0.13 2.9 15 18 29 NA 111 ...
## $ X09 : num 14 2 0.6 0.14 4.1 16 20 29 NA 111 ...
## $ X10 : num 15 2 0.55 0.13 4.2 14 18 35 NA 100 ...
## $ X11 : num 14 2 0.51 0.11 3.8 13 17 39 NA 92 ...
## $ X12 : num 15 2 0.47 0.11 4.2 12 16 41 NA 83 ...
## $ X13 : num 14 2 0.42 0.09 2.9 10 13 43 NA 76 ...
## $ X14 : num 14 2 0.41 0.1 3 12 15 39 NA 73 ...
## $ X15 : num 13 2 0.42 0.1 2.4 13 16 36 NA 66 ...
## $ X16 : num 13 2 0.45 0.11 1.8 14 16 33 NA 61 ...
## $ X17 : num 13 2 0.46 0.12 2 16 18 31 NA 57 ...
## $ X18 : num 13 2 0.45 0.13 2.5 18 20 29 NA 50 ...
## $ X19 : num 13 2 0.48 0.15 2.8 21 24 24 NA 45 ...
## $ X20 : num 13 2 0.52 0.16 3.3 23 27 20 NA 45 ...
## $ X21 : num 13 2 0.43 0.13 3.2 19 23 22 NA 43 ...
## $ X22 : num 13 2 0.4 0.12 2.7 17 20 23 NA 40 ...
## $ X23 : num 13 2 0.35 0.11 1.6 14 15 26 NA 42 ...
The Hours are currently in columns, but these are observations (–> Rows!). The column “item” contains the variables per row –> Columns
I want to have the hours in rows. So I need a new column for ‘Hour’ (-> key = “Hour”) and the corresponding ‘Value’, currently spread of the column 4:27.
gatherDf <- gather(df, key = "Hour", value = "Value", 4:27 )
str(gatherDf)
## 'data.frame': 183960 obs. of 5 variables:
## $ date : Date, format: "2015-01-01" "2015-01-01" ...
## $ station: Factor w/ 1 level "Banqiao": 1 1 1 1 1 1 1 1 1 1 ...
## $ item : Factor w/ 21 levels "AMB_TEMP","CH4",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Hour : chr "X00" "X00" "X00" "X00" ...
## $ Value : num 16 2.1 0.79 0.14 1.2 16 17 37 NA 177 ...
But if we want to have the indicators spread over columns - no problem - just use spread()-function from tidyR.
spreadDf <- spread(gatherDf, key = item, value = Value)
Let’s check the result for the dataset.
str(spreadDf)
## 'data.frame': 8760 obs. of 24 variables:
## $ date : Date, format: "2015-01-01" "2015-01-01" ...
## $ station : Factor w/ 1 level "Banqiao": 1 1 1 1 1 1 1 1 1 1 ...
## $ Hour : chr "X00" "X01" "X02" "X03" ...
## $ AMB_TEMP : num 16 16 16 15 15 14 14 14 14 14 ...
## $ CH4 : num 2.1 2.1 2.1 2 2 2 2 2 2 2 ...
## $ CO : num 0.79 0.8 0.71 0.66 0.53 0.5 0.57 0.61 0.59 0.6 ...
## $ NMHC : num 0.14 0.15 0.13 0.12 0.11 0.11 0.14 0.14 0.13 0.14 ...
## $ NO : num 1.2 1.3 1 0.8 0.6 0.8 1.4 2.3 2.9 4.1 ...
## $ NO2 : num 16 16 13 11 10 11 18 17 15 16 ...
## $ NOx : num 17 17 14 12 11 12 19 20 18 20 ...
## $ O3 : num 37 36 38 39 38 37 29 27 29 29 ...
## $ PH_RAIN : num NA NA NA NA NA NA NA NA NA NA ...
## $ PM10 : num 177 178 163 147 131 112 103 104 111 111 ...
## $ PM2.5 : num NA NA NA NA NA NA NA NA NA NA ...
## $ RAIN_COND : num NA NA NA NA NA NA NA NA NA NA ...
## $ RAINFALL : num NA NA NA NA NA NA NA NA NA NA ...
## $ RH : num 57 57 57 58 58 57 57 56 53 50 ...
## $ SO2 : num 12 11 8 6.5 5.5 5.7 5.8 6.5 6.8 5.9 ...
## $ THC : num 2.2 2.2 2.2 2.2 2.1 2.1 2.1 2.2 2.1 2.1 ...
## $ UVB : num 0 0 0 0 0 0 0 0.2 0.7 1.6 ...
## $ WD_HR : num 69 67 63 63 69 68 74 66 71 73 ...
## $ WIND_DIREC: num 69 65 53 63 67 62 70 72 73 68 ...
## $ WIND_SPEED: num 4.7 4 3.7 4.1 3 2.9 3.6 3.1 2.8 3.6 ...
## $ WS_HR : num 4.2 4 3.5 3.3 3.1 3.3 2.2 3 3.3 3.2 ...
Prevalence of HIV refers to the percentage of people ages 15-49 who are infected with HIV. Data source can be found from here.
library(RCurl)
## Loading required package: bitops
##
## Attaching package: 'RCurl'
## The following object is masked from 'package:tidyr':
##
## complete
library(tibble)
url <- getURL("https://raw.githubusercontent.com/ahmshahparan/DATA607_PROJECT02/master/indicator%20hiv%20estimated%20prevalence%25%2015-49.csv")
df <- read.csv(text = url, header = TRUE, check.names = FALSE)
colnames(df)[colnames(df) == "Estimated HIV Prevalence% - (Ages 15-49)"] <- "country"
hiv <- as.tibble(df) %>%
gather(`1979`:`2011`, key = year, value = prec, na.rm = TRUE)
Find maximum HIV percentage for each country
hiv %>%
group_by(country) %>%
summarise(max_perc = max(prec, na.rm = TRUE))
## # A tibble: 154 x 2
## country max_perc
## <fct> <dbl>
## 1 Afghanistan 0.0600
## 2 Algeria 0.100
## 3 Angola 2.10
## 4 Argentina 0.500
## 5 Armenia 0.200
## 6 Australia 0.200
## 7 Austria 0.400
## 8 Azerbaijan 0.100
## 9 Bahamas 3.90
## 10 Bangladesh 0.0600
## # ... with 144 more rows
Find the country with max HIV prevalence percentage
hiv %>%
group_by(country) %>%
summarise(max_perc = max(prec, na.rm = TRUE)) %>%
arrange(desc(max_perc))
## # A tibble: 154 x 2
## country max_perc
## <fct> <dbl>
## 1 Zimbabwe 26.5
## 2 Botswana 26.3
## 3 Swaziland 26.0
## 4 Lesotho 24.5
## 5 South Africa 18.1
## 6 Namibia 16.5
## 7 Zambia 15.0
## 8 Malawi 14.7
## 9 Mozambique 11.4
## 10 Uganda 10.7
## # ... with 144 more rows
Zimbabwe has the maximum HIV prevalence percentage followed by Botswana.
Scatter plot for HIV prevalence percentage:
hiv %>%
ggplot(aes(x = year, y = prec, group = country)) +
geom_jitter() +
coord_flip()