# Packages used by this analysis, in the order they should be attached.
required_packages <- c("dplyr", "tidyr", "DT", "ggplot2", "xlsx", "reshape2")

# Install anything that is missing, then attach every package.
# The previous `if (!require(pkg)) install.packages(pkg)` form installed a
# missing package but never attached it, so a first run on a fresh machine
# failed at the first `%>%` / `datatable()` call. `library()` also fails
# loudly if a package still cannot be loaded after installation.
for (pkg in required_packages) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
  library(pkg, character.only = TRUE)
}
Let’s revisit the original questions, knowing that these are projections based on sample data from 51,000 businesses. * Which occupation categories have the most jobs available? * Which are the top 10 highest-paying jobs?
# Load the wage data, keeping text columns as plain character vectors.
labor <- read.csv(
  "wagesCSV.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
datatable(labor)

## Data Prep
# Drop every row that has at least one missing value.
df <- na.omit(labor)
datatable(df)

# Remove the category-level "All"/total rows, identified by "-0000" appearing
# in the SOC code, so that only individual occupations remain.
clean_data <- df[grep("-0000", df$SOCCode, invert = TRUE), ]
datatable(clean_data)
# Top 10 occupations by available jobs: sort by Employment (largest first)
# and keep the first ten rows.
df_e <- clean_data %>%
  arrange(desc(Employment)) %>%
  head(n = 10) %>%
  as.data.frame()

# Only the occupation title and its employment count are needed for the plot.
lst_Employ <- select(df_e, Title, Employment)

# Lollipop chart. Rows are re-sorted ascending and Title is turned into a
# factor whose level order follows that sort, so that after coord_flip() the
# largest value is drawn at the top.
lst_Employ %>%
  arrange(Employment) %>%
  mutate(Title = factor(Title, levels = Title)) %>%
  ggplot(aes(x = Title, y = Employment)) +
  geom_segment(aes(xend = Title, yend = 1)) +
  geom_point(size = 4, color = "orange") +
  coord_flip() +
  # Title spelling corrected ("avalible" -> "available").
  labs(title = "Top 10 Occupations by available jobs") +
  theme_minimal()
# Highest-paying jobs: the ten occupations with the largest mean salary.
highest_mean <- clean_data %>%
  arrange(desc(Mean)) %>%
  head(n = 10) %>%
  as.data.frame()
datatable(highest_mean)

# Lollipop chart; Title's factor levels follow the ascending sort so the
# highest mean salary ends up at the top once the axes are flipped.
highest_mean %>%
  arrange(Mean) %>%
  mutate(Title = factor(Title, levels = Title)) %>%
  ggplot(aes(x = Title, y = Mean)) +
  geom_segment(aes(xend = Title, yend = 1)) +
  geom_point(size = 4, color = "orange") +
  coord_flip() +
  labs(title = "Top 10 Occupations by Highest Mean Salary") +
  theme_minimal()
Can you really predict the winning lottery numbers? I am going to find some of the most common winning lottery numbers in the last 10 years.
Lottery Mega Millions Winning Numbers: Beginning 2002. I used that file to create the .csv imported below.
# Read the Mega Millions draw history; text columns stay as character.
lotto <- read.csv(
  "Lottery_Mega_Millions_Winning_Numbers__Beginning_2002.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
# Preview the first 20 draws.
head(lotto, 20)
Draw.Date Winning.Numbers Mega.Ball Multiplier
1 5/17/2002 15 18 25 33 47 30 NA
2 5/21/2002 04 28 39 41 44 9 NA
3 5/24/2002 02 04 32 44 52 36 NA
4 5/28/2002 06 21 22 29 32 24 NA
5 5/31/2002 12 28 45 46 52 47 NA
6 6/4/2002 03 25 29 30 48 48 NA
7 6/7/2002 14 22 27 28 42 13 NA
8 6/11/2002 05 06 09 33 44 52 NA
9 6/14/2002 04 08 32 37 43 2 NA
10 6/18/2002 06 13 18 27 45 18 NA
11 6/21/2002 13 18 32 39 49 6 NA
12 6/25/2002 04 18 21 27 41 50 NA
13 6/28/2002 18 31 49 50 51 4 NA
14 7/2/2002 14 22 32 35 44 6 NA
15 7/5/2002 11 20 26 29 41 41 NA
16 7/9/2002 26 29 31 44 48 40 NA
17 7/12/2002 13 19 23 38 47 15 NA
18 7/16/2002 10 24 35 49 52 47 NA
19 7/19/2002 07 15 24 37 46 9 NA
20 7/23/2002 10 12 29 32 38 7 NA
We are only interested in a few variables here: the Winning Numbers and Mega Ball numbers from 2008 onward, so we will omit the older data. Because the winning numbers are merged into a single column, the data is not usable as-is.
# Drop the old draws: remove every row whose Draw.Date contains "/2002"
# through "/2007". A single character-class pattern replaces the six
# copy-pasted filter passes and removes exactly the same rows; original row
# names are preserved.
new_data <- lotto[!grepl("/200[2-7]", lotto$Draw.Date), ]
head(new_data)
## Draw.Date Winning.Numbers Mega.Ball Multiplier
## 587 1/1/2008 13 16 25 30 54 11 NA
## 588 1/4/2008 24 31 39 40 56 32 NA
## 589 1/8/2008 10 29 45 52 54 10 NA
## 590 1/11/2008 22 36 42 45 55 42 NA
## 591 1/15/2008 21 30 42 44 50 6 NA
## 592 1/18/2008 12 22 33 43 44 15 NA
# Keep only the Winning.Numbers column; drop = FALSE keeps the result a
# one-column data frame (with the original row names) rather than a vector.
lotto_No <- new_data[, "Winning.Numbers", drop = FALSE]
head(lotto_No)
## Winning.Numbers
## 587 13 16 25 30 54
## 588 24 31 39 40 56
## 589 10 29 45 52 54
## 590 22 36 42 45 55
## 591 21 30 42 44 50
## 592 12 22 33 43 44
# Keep only the Mega.Ball column, again as a one-column data frame.
mega_ball <- new_data[, "Mega.Ball", drop = FALSE]
head(mega_ball)
## Mega.Ball
## 587 11
## 588 32
## 589 10
## 590 42
## 591 6
## 592 15
# Split each space-delimited draw into its individual number strings
# (one character vector per draw).
temp <- strsplit(lotto_No[["Winning.Numbers"]], split = " ", fixed = TRUE)

# Count how often each number appears across all draws. The table() argument
# is deliberately an unnamed call: that is what makes as.data.frame() name the
# value column "Var1", which later code relies on.
Number_frequecy <- as.data.frame(table(unlist(temp)))
head(Number_frequecy)
## Var1 Freq
## 1 01 89
## 2 02 113
## 3 03 95
## 4 04 100
## 5 05 84
## 6 06 86
# Mega ball: count how often each Mega Ball value was drawn.
# The values are passed as a vector under an explicit argument name so the
# resulting column is always called "mega_ball". Calling table() directly on
# the one-column data frame leaves the column name to table()'s naming rules
# for list arguments, which are not the same in every R version
# ("mega_ball" vs "Mega.Ball"), and later code reads `$mega_ball`.
Mega_ball_Frequency <- as.data.frame(table(mega_ball = mega_ball$Mega.Ball))
head(Mega_ball_Frequency)
## mega_ball Freq
## 1 1 44
## 2 2 42
## 3 3 48
## 4 4 41
## 5 5 31
## 6 6 47
# The five numbers that occur most often among the winning numbers.
most_won_no <- head(arrange(Number_frequecy, desc(Freq)), n = 5)

# Var1 is a factor. Convert through its labels ("02" -> 2) to get the actual
# drawn numbers; the previous c(sort(Var1)) returned the factor's internal
# level codes (or, in R >= 4.1, a factor), which only match the drawn numbers
# when every level from "01" upward happens to be present.
five_number <- sort(as.integer(as.character(most_won_no$Var1)))
five_number
## [1] 2 11 17 20 31
# Most frequently drawn Mega Ball number(s); ties are all kept.
tmp <- Mega_ball_Frequency[
  Mega_ball_Frequency$Freq == max(Mega_ball_Frequency$Freq),
]

# The first column of a frequency data frame holds the tabulated value as a
# factor. Read it by position (its name can differ between R versions) and
# convert through the labels to get the real ball number; c() on the factor
# returned internal level codes (or a factor in R >= 4.1) instead.
p_mega <- as.integer(as.character(tmp[[1]]))
p_mega
## [1] 9
The leading causes of death by sex and ethnicity in New York City since 2007.
I am going to find some of the facts behind the causes of death.
The leading causes of death by sex and ethnicity in New York City since 2007. I used that file to create the .csv imported below.
# Read the NYC leading-causes-of-death data (note: this reuses the name `df`).
df <- read.csv(
  "New_York_City_Leading_Causes_of_Death.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
# Show the first 20 rows as an interactive table.
df %>%
  head(n = 20) %>%
  datatable()

# Remove rows that contain any missing value.
ny_cause_death <- na.omit(df)
datatable(ny_cause_death)
# 2014
# Keep the rows whose Year matches "2014" and only the four columns used below.
data_2014 <- ny_cause_death[grepl("2014", ny_cause_death$Year), ] %>%
  select(Leading.Cause, Sex, Race.Ethnicity, Death.Rate) %>%
  as.data.frame()

# Treat the three grouping columns as categorical.
data_2014[["Leading.Cause"]] <- factor(data_2014[["Leading.Cause"]])
data_2014[["Sex"]] <- factor(data_2014[["Sex"]])
data_2014[["Race.Ethnicity"]] <- factor(data_2014[["Race.Ethnicity"]])

# Wide table with one Death.Rate column per Sex; incomplete rows are dropped.
xx <- na.omit(spread(data_2014, key = Sex, value = Death.Rate))
datatable(xx)

# Wide table with one Death.Rate column per Race.Ethnicity; incomplete rows
# are dropped.
x1 <- na.omit(spread(data_2014, key = Race.Ethnicity, value = Death.Rate))
datatable(x1)
# 2014 male: rows whose Sex value contains an upper-case "M".
data_2014_male <- x1[grep("M", x1$Sex), ]
datatable(data_2014_male)

# 2014 female: every remaining row (Sex does not contain "M").
data_2014_female <- x1[grep("M", x1$Sex, invert = TRUE), ]
datatable(data_2014_female)
# Grouped bar chart of death rate per leading cause, one bar per
# race/ethnicity; x-axis labels are rotated so the long cause names fit.
ggplot(
  data_2014,
  aes(x = Leading.Cause, y = Death.Rate, fill = Race.Ethnicity)
) +
  geom_bar(stat = "identity", position = "dodge") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Rows of the per-ethnicity table whose cause matches "Diseases of Heart".
Heart <- x1[grep("Diseases of Heart", x1$Leading.Cause), ]
print(Heart)
## Leading.Cause Sex Asian and Pacific Islander Black Non-Hispanic
## 15 Diseases of Heart F 738 2091
## 16 Diseases of Heart M 965 2268
## Hispanic White Non-Hispanic
## 15 971 318
## 16 1073 2971
# Add a `total` column: the row-wise sum of columns 3 to 6 of Heart (the
# per-ethnicity columns in the printed output above).
a <- Heart %>%
  mutate(total = rowSums(Heart[3:6]))
datatable(a)

# Ratio of the second row's total to the first row's total (column 7 is the
# `total` column when Heart has six columns, as in the printed output).
ratio <- a[2, 7] / a[1, 7]

# Pie chart of the two totals; the labels assume the first row is female and
# the second is male, as in the printed output.
pie(
  a$total,
  labels = c("Female", "Male"),
  radius = 1,
  main = "Diseases of Heart"
)