Environment Prep

# Attach every package the report uses, installing any that are missing.
# require() returns FALSE (with a warning) when a package is not installed;
# install.packages() only installs and does not attach, so library() is
# called afterwards to make the package available on a first run.
for (pkg in c("dplyr", "tidyr", "DT", "ggplot2", "xlsx", "reshape2")) {
  if (!require(pkg, character.only = TRUE)) {
    install.packages(pkg)
    library(pkg, character.only = TRUE)
  }
}

Dataset 1: Occupational Wages for the New York City Region

Ref: labor.ny.gov
Data Source: Employment and wage data by occupation are based on the Occupational Employment Statistics (OES) survey, which collects information from approximately 51,000 businesses. Estimates are based on responses from six semi-annual panels collected between November 2014 and May 2017. Wages were then updated to the first quarter of 2018 by making cost-of-living adjustments. Occupational employment and wages are presented for New York State and its 10 labor market regions.
Data Citation: Occupational Wages for the New York City Region: November 2014 and May 2017. Source: labor.ny.gov
Additional Information: The entire set of currently available OES employment and wage data is available by downloading the file

Abstract

Let’s revise the original questions, keeping in mind that these figures are projections based on sample data from about 51,000 businesses.

* Which occupations have the most jobs available?
* Which 10 occupations are the highest paying?

Data Import

# Read the wage file, keeping text columns as character vectors,
# and show the raw table as an interactive widget.
labor <- read.csv(
  file = "wagesCSV.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
datatable(labor)

## Data Prep

# Drop every row that has at least one missing value.
df <- labor %>% na.omit()
datatable(df)

# Remove the rows whose SOC code contains "-0000".
clean_data <- df[!grepl(pattern = "-0000", x = df$SOCCode), ]
datatable(clean_data)

Analysis (Including Plots)

# Ten occupations with the largest Employment values.
df_e <- clean_data %>%
  arrange(desc(Employment)) %>%
  head(n = 10) %>%
  as.data.frame()
lst_Employ <- select(df_e, Title, Employment)

# Lollipop chart: Title is turned into a factor whose levels follow the
# ascending Employment order, so the bars are drawn in ranked order.
lst_Employ %>%
  arrange(Employment) %>%
  mutate(Title = factor(Title, levels = Title)) %>%
  ggplot(aes(x = Title, y = Employment)) +
  geom_segment(aes(xend = Title, yend = 1)) +
  geom_point(size = 4, color = "orange") +
  labs(title = "Top 10 Occupations by avalible jobs") +
  coord_flip() +
  theme_minimal()

# Ten occupations with the largest Mean values.
highest_mean <- clean_data %>%
  arrange(desc(Mean)) %>%
  head(n = 10) %>%
  as.data.frame()
datatable(highest_mean)

# Lollipop chart: Title levels follow the ascending Mean order so the
# occupations are drawn in ranked order.
highest_mean %>%
  arrange(Mean) %>%
  mutate(Title = factor(Title, levels = Title)) %>%
  ggplot(aes(x = Title, y = Mean)) +
  geom_segment(aes(xend = Title, yend = 1)) +
  geom_point(size = 4, color = "orange") +
  labs(title = "Top 10 Occupations by Highest Mean Salary") +
  coord_flip() +
  theme_minimal()

Dataset 2: Predicting The Mega Millions Lottery

Data Source: Lottery Mega Millions Winning Numbers Repository Source
Data Citation: Data.gov Link

Abstract

Can you really predict the winning lottery numbers? I am going to find the most common winning lottery numbers from the last 10 years.

Data Import

Lottery Mega Millions Winning Numbers: Beginning 2002. I used that file to create the .csv imported below.

# Read the Mega Millions draw history, keeping text columns as character
# vectors, and preview the first 20 draws.
lotto <- read.csv(
  file = "Lottery_Mega_Millions_Winning_Numbers__Beginning_2002.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
head(lotto, n = 20)

   Draw.Date Winning.Numbers Mega.Ball Multiplier
1  5/17/2002  15 18 25 33 47        30         NA
2  5/21/2002  04 28 39 41 44         9         NA
3  5/24/2002  02 04 32 44 52        36         NA
4  5/28/2002  06 21 22 29 32        24         NA
5  5/31/2002  12 28 45 46 52        47         NA
6   6/4/2002  03 25 29 30 48        48         NA
7   6/7/2002  14 22 27 28 42        13         NA
8  6/11/2002  05 06 09 33 44        52         NA
9  6/14/2002  04 08 32 37 43         2         NA
10 6/18/2002  06 13 18 27 45        18         NA
11 6/21/2002  13 18 32 39 49         6         NA
12 6/25/2002  04 18 21 27 41        50         NA
13 6/28/2002  18 31 49 50 51         4         NA
14  7/2/2002  14 22 32 35 44         6         NA
15  7/5/2002  11 20 26 29 41        41         NA
16  7/9/2002  26 29 31 44 48        40         NA
17 7/12/2002  13 19 23 38 47        15         NA
18 7/16/2002  10 24 35 49 52        47         NA
19 7/19/2002  07 15 24 37 46         9         NA
20 7/23/2002  10 12 29 32 38         7         NA

Data Prep

We are only interested in two variables here: the Winning Numbers and the Mega Ball numbers from 2008 onward, so we will omit the older data. Because the five winning numbers are merged into one column, the data is not usable as-is and has to be split first.

# Drop every draw whose date contains "/2002" through "/2007".
# The pattern is the alternation "/2002|/2003|...|/2007", which removes
# the same rows as filtering on each year in turn.
new_data <- lotto[
  !grepl(paste0("/", 2002:2007, collapse = "|"), lotto$Draw.Date),
]
head(new_data)

##     Draw.Date Winning.Numbers Mega.Ball Multiplier
## 587  1/1/2008  13 16 25 30 54        11         NA
## 588  1/4/2008  24 31 39 40 56        32         NA
## 589  1/8/2008  10 29 45 52 54        10         NA
## 590 1/11/2008  22 36 42 45 55        42         NA
## 591 1/15/2008  21 30 42 44 50         6         NA
## 592 1/18/2008  12 22 33 43 44        15         NA

# Keep only the Winning.Numbers column as a one-column data frame.
lotto_No <- new_data %>%
  select(Winning.Numbers) %>%
  as.data.frame()
head(lotto_No)

##     Winning.Numbers
## 587  13 16 25 30 54
## 588  24 31 39 40 56
## 589  10 29 45 52 54
## 590  22 36 42 45 55
## 591  21 30 42 44 50
## 592  12 22 33 43 44

# Keep only the Mega.Ball column as a one-column data frame.
mega_ball <- new_data %>%
  select(Mega.Ball) %>%
  as.data.frame()
head(mega_ball)

##     Mega.Ball
## 587        11
## 588        32
## 589        10
## 590        42
## 591         6
## 592        15

# Split each space-separated draw string into its individual numbers
# (one character vector per draw).
temp <- strsplit(x = lotto_No$Winning.Numbers, split = " ")

# Flatten all draws into one vector and count how often each number
# occurs; the result has the columns Var1 (the number) and Freq.
Number_frequecy <- as.data.frame(
  table(unlist(temp))
)
head(Number_frequecy)

##   Var1 Freq
## 1   01   89
## 2   02  113
## 3   03   95
## 4   04  100
## 5   05   84
## 6   06   86

# Count how often each Mega Ball value occurs; the result has the
# columns mega_ball (the value) and Freq.
Mega_ball_Frequency <- as.data.frame(
  table(mega_ball)
)
head(Mega_ball_Frequency)

##   mega_ball Freq
## 1         1   44
## 2         2   42
## 3         3   48
## 4         4   41
## 5         5   31
## 6         6   47

# The five numbers that occur most often among the winning numbers.
most_won_no <- head(arrange(Number_frequecy, desc(Freq)), n = 5)

# Var1 is a factor (it comes from as.data.frame(table(...))). Converting
# it with c() yields the internal level codes rather than the drawn
# numbers, and those only coincide while every value "01", "02", ...
# is present without a gap. Go through the labels to get the real numbers.
five_number <- sort(as.integer(as.character(most_won_no$Var1)))
five_number

## [1]  2 11 17 20 31

# The Mega Ball value(s) with the highest frequency (all ties are kept).
tmp <- Mega_ball_Frequency[
  Mega_ball_Frequency$Freq == max(Mega_ball_Frequency$Freq),
]

# mega_ball is a factor (it comes from as.data.frame(table(...))).
# Convert through its labels so the result is the ball value itself
# rather than the factor's internal level code.
p_mega <- as.integer(as.character(tmp$mega_ball))
p_mega

## [1] 9

Result

The five most frequent winning numbers are 2, 11, 17, 20 and 31, and the most frequent Mega Ball is 9.

Dataset 3: New York City Leading Causes of Death

Abstract

The leading causes of death by sex and ethnicity in New York City since 2007.

I am going to explore some of the facts behind these causes of death.

Data Import

The leading causes of death by sex and ethnicity in New York City since 2007. I used that file to create the .csv imported below.

# Read the leading-causes-of-death file, keeping text columns as
# character vectors, and show the first 20 rows as an interactive table.
df <- read.csv(
  file = "New_York_City_Leading_Causes_of_Death.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
datatable(head(df, n = 20))

Data Prep

# Drop every row that has at least one missing value.
ny_cause_death <- df %>% na.omit()
datatable(ny_cause_death)

# 2014 only: keep the rows whose Year contains "2014", keep the four
# columns used below, and store the three grouping columns as factors.
data_2014 <- ny_cause_death[grepl("2014", ny_cause_death$Year), ] %>%
  select(Leading.Cause, Sex, Race.Ethnicity, Death.Rate) %>%
  mutate(
    Race.Ethnicity = factor(Race.Ethnicity),
    Sex = factor(Sex),
    Leading.Cause = factor(Leading.Cause)
  ) %>%
  as.data.frame()

# Wide table with one Death.Rate column per Sex value; rows with any
# missing cell are dropped.
xx <- data_2014 %>%
  spread(key = Sex, value = Death.Rate) %>%
  na.omit()
datatable(xx)

# Wide table with one Death.Rate column per Race.Ethnicity value; rows
# with any missing cell are dropped.
x1 <- data_2014 %>%
  spread(key = Race.Ethnicity, value = Death.Rate) %>%
  na.omit()
datatable(x1)

# 2014, rows whose Sex contains "M"
data_2014_male <- x1[grepl("M", x1$Sex), ]
datatable(data_2014_male)

# 2014, all remaining rows (Sex does not contain "M")
data_2014_female <- x1[!grepl("M", x1$Sex), ]
datatable(data_2014_female)

# Dodged bar chart of Death.Rate per leading cause, coloured by
# Race.Ethnicity, with the x-axis labels rotated to stay readable.
ggplot(
  data_2014,
  aes(fill = Race.Ethnicity, y = Death.Rate, x = Leading.Cause)
) +
  geom_bar(position = "dodge", stat = "identity") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

result

# Rows of the per-ethnicity table whose Leading.Cause contains
# "Diseases of Heart".
Heart <- x1[
  grepl(pattern = "Diseases of Heart", x = x1$Leading.Cause),
]
Heart

##        Leading.Cause Sex Asian and Pacific Islander Black Non-Hispanic
## 15 Diseases of Heart   F                        738               2091
## 16 Diseases of Heart   M                        965               2268
##    Hispanic White Non-Hispanic
## 15      971                318
## 16     1073               2971

# Add a per-row total over columns 3 to 6 of Heart.
a <- Heart %>%
  mutate(total = rowSums(Heart[3:6]))
datatable(a)

# Second row's total divided by the first row's total (column 7 is the
# total column appended above).
ratio <- a[[2, 7]] / a[[1, 7]]

# Pie chart of the two totals, labelled in row order.
pie(
  a$total,
  labels = c("Female", "Male"),
  radius = 1,
  main = "Diseases of Heart"
)

DATA 607: Tidy Data, Project 2

VINAYAK PATEL

October 7, 2018

Environment Prep

Dataset 1: Occupational Wages for the New York City Region

Abstract

Data Import

Analysis (Including Plots)

Dataset 2: Predicting The Mega Millions Lottery

Abstract

Data Import

Data Prep

Result

The five most frequent winning numbers are 2, 11, 17, 20 and 31, and the most frequent Mega Ball is 9.

Dataset 3: New York City Leading Causes of Death

Abstract

Data Import

Data Prep

result

Ratio of heart disease between males and females is 1.76712 : 1