library(dplyr)
library(rvest)
library(prettifyAddins)
library(ggplot2)
library(stringr)
library(sqldf)
# Address of the page listing the top baby names for boys.
html_form_page <- "http://www.whattoexpect.com/baby-names/list/top-baby-names-for-boys/###top-names"

Reading the HTML code

# Download the page and parse it into an HTML document object.
webpage <- read_html(html_form_page)

summary(webpage)
##      Length Class  Mode       
## node 1      -none- externalptr
## doc  1      -none- externalptr
head(webpage)
## $node
## <pointer: 0x0000000017a89170>
## 
## $doc
## <pointer: 0x0000000017fa9fd0>
# Collect the text of every <li> element on the page. This grabs menu and
# footer entries as well as the baby names, so the result is trimmed later.
list_items <- html_nodes(webpage, "li")
names <- html_text(list_items)

Inspecting the scraped data

# Look at the first entries of the scraped text: these are site menu items,
# not baby names.
head(names) 
## [1] "  Log In / Join "
## [2] " Getting Pregnant    Fertility  Ovulation  Preparing for Pregnancy "
## [3] " Fertility "
## [4] " Ovulation "
## [5] " Preparing for Pregnancy "
## [6] " Pregnancy    Week By Week  Symptoms  Baby Names  Baby Shower  Complications  Due Date Calculator  Labor & Delivery  Screenings & Tests  Signs of Labor "
# The last entries are footer links, which also need to be removed.
tail(names)
## [1] "What to Expect Bookstore"            "Advertising Policy"                 
## [3] "Do Not Sell My Personal Information" "Help"                               
## [5] " AdChoices "                         "Feedback"
# Confirm the scraped result is a plain character vector.
class(names)
## [1] "character"

This dataset is not a dataframe; we need to turn it into one.

# Wrap the scraped character vector in a one-column data frame; the column
# keeps the name `names`.
names <- data.frame(names = names)

head(names)
##                                                                                                                                                      names
## 1                                                                                                                                           Log In / Join 
## 2                                                                                       Getting Pregnant    Fertility  Ovulation  Preparing for Pregnancy 
## 3                                                                                                                                               Fertility 
## 4                                                                                                                                               Ovulation 
## 5                                                                                                                                 Preparing for Pregnancy 
## 6  Pregnancy    Week By Week  Symptoms  Baby Names  Baby Shower  Complications  Due Date Calculator  Labor & Delivery  Screenings & Tests  Signs of Labor
class(names)
## [1] "data.frame"

Need to get rid of rows 1-65, and potentially more

# Remove the first 65 rows, which hold site menu text rather than baby names.
names <- names[-(1:65), ]

head(names)
## [1] Liam     Noah     William  James    Oliver   Benjamin
## 1083 Levels:    Log In / Join   AdChoices  ... Zyaire
class(names) ### factor?
## [1] "factor"
# Row-subsetting the single-column data frame collapsed it to a factor,
# so wrap it in a data frame again before dropping more rows by position.
names <- data.frame(names)

# Remove rows 1002 through 1021 from the tail of the list.
names <- names[-(1002:1021), ] ### I just wanted to make sure I didn't delete any names

2 extra rows as predicted

# Re-wrap the factor in a data frame so rows can be dropped by position again.
names <- data.frame(names)

# Remove the two extra trailing rows (1001 and 1002) left after the previous trim.
names <- names[-(1001:1002), ]

Baby_Boy_Names_2020 <- names

# A name's rank is its position in the scraped list. Derive the ranks from
# the data rather than hard-coding 1:1000, so a list of a different length
# cannot be silently recycled (or fail) when the columns are bound together.
Ranking <- seq_along(Baby_Boy_Names_2020)

names_ranked <- cbind.data.frame(Ranking, Baby_Boy_Names_2020)

head(names_ranked)
##   Ranking Baby_Boy_Names_2020
## 1       1                Liam
## 2       2                Noah
## 3       3             William
## 4       4               James
## 5       5              Oliver
## 6       6            Benjamin

##————————— Now you want to remove any names that do not meet your conditions.—————————————

1. How many boys were born in said year? 2019 is the only year for which I gathered the number of births. I am using this number for the 2020 baby names because the 2020 list appears to be based on the 2019 data.

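Around 37,308,668 boys and 35,730,482 girls, according to the KIDS COUNT child-population-by-gender table (note that this figure is the total child population, not the number of births in a single year): https://datacenter.kidscount.org/data/tables/102-child-population-by-gender#detailed/1/any/false/1729,37,871,870,573,869,36,868,867,133/14,15,65/421,422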

Percentage/number of boys with top 2 names: Liam and Noah in 2019?

# Estimated number of boys with each of the top two names: the name's share
# multiplied by the boy count quoted above (presumably Liam first, then Noah).
.010741 * 37308668
## [1] 400732.4
.009979 * 37308668
## [1] 372303.2

400,732 We won’t count the .4 of a person

372,303 We won’t count the .2 of a person

These top two names I will exclude from the data set.

Percentage/number of girls with top name: Olivia 2020?

# Estimated number of girls named Olivia: the name's share multiplied by the
# girl count quoted above.
# NOTE(review): this multiplier (.0010122) is about ten times smaller than the
# multipliers used for the top boys' names (.010741, .009979) -- confirm it is
# not missing a digit.
.0010122 * 35730482
## [1] 36166.39

36,166 Once again, we won’t count the .39 of a person. Since I don’t have a dataset of girl names I will not finish this analysis…

However, I would like to look at it in the future! Note: This post took more time than I initially anticipated; please request more if you like it.

For boys names, I am going to strip out the top two: Liam and Noah

# Exclude the two most popular names (ranks 1 and 2) from the candidates.
names_ranked <- names_ranked[-(1:2), ]

head(names_ranked) ### William is the new top name for 2020
##   Ranking Baby_Boy_Names_2020
## 3       3             William
## 4       4               James
## 5       5              Oliver
## 6       6            Benjamin
## 7       7              Elijah
## 8       8               Lucas

2. Cannot be the same as my name, my parents' names, etc.

Taylor, cristiano, Madona, Shaquille, Jojo, Messi, Michael

Do a text-string search for those names, or names like them, in SQL.

# Names that must not be reused: family names plus their short forms.
family_names <- c(
  "Taylor", "cristiano", "Madona", "Shaquille", "Jojo", "Messi", "Michael",
  ### adding on abbreviated names
  "Tay", "Chris", "Shaqy", "Jo", "Mike"
)

# sqldf can only query data frames, so expose the exclusion list as a
# one-column table. "Nate" is appended because the earlier hand-typed SQL list
# excluded it as well, even though it is not part of `family_names`.
excluded_names <- data.frame(
  name = c(family_names, "Nate"),
  stringsAsFactors = FALSE
)

# Sanity check that sqldf can query the ranked names.
sqldf("SELECT *
      FROM names_ranked
      WHERE Baby_Boy_Names_2020 = 'William'")
##   Ranking Baby_Boy_Names_2020
## 1       3             William

# Keep a name only if it
#   * is not one of the excluded names (compared case-insensitively, so the
#     lowercase entry "cristiano" also matches a capitalised name),
#   * contains none of the letters r, y or i, and
#   * contains at least one a or e.
# SQLite's LIKE ignores case for ASCII letters, so capital letters count too.
new_boy_names <- sqldf("SELECT *
      FROM names_ranked
      WHERE LOWER(Baby_Boy_Names_2020) NOT IN (SELECT LOWER(name)
                                               FROM excluded_names)
        AND Baby_Boy_Names_2020 NOT LIKE '%r%'
        AND Baby_Boy_Names_2020 NOT LIKE '%y%'
        AND Baby_Boy_Names_2020 NOT LIKE '%i%'
        AND (Baby_Boy_Names_2020 LIKE '%a%'
              OR Baby_Boy_Names_2020 LIKE '%e%')
      ")

# Confirm the query result came back as a data frame.
class(new_boy_names)
## [1] "data.frame"
# First few names that passed every filter.
head(new_boy_names)
##   Ranking Baby_Boy_Names_2020
## 1       4               James
## 2       8               Lucas
## 3       9               Mason
## 4      10               Logan
## 5      12               Ethan
## 6      13               Jacob
# Rank distribution of the remaining names and a count per name.
summary(new_boy_names)
##     Ranking      Baby_Boy_Names_2020
##  Min.   :  4.0   Abdullah:  1       
##  1st Qu.:216.2   Abel    :  1       
##  Median :445.5   Ace     :  1       
##  Mean   :461.6   Adam    :  1       
##  3rd Qu.:691.2   Adan    :  1       
##  Max.   :999.0   Aden    :  1       
##                  (Other) :264
# Render the full filtered list as a captioned table.
knitr::kable(new_boy_names, caption = "Dataset After Meeting Specified Conditions")
Dataset After Meeting Specified Conditions
Ranking Baby_Boy_Names_2020
4 James
8 Lucas
9 Mason
10 Logan
12 Ethan
13 Jacob
17 Jackson
20 Matthew
21 Samuel
23 Joseph
25 Owen
28 Jack
29 Luke
37 Mateo
39 Jaxon
41 Joshua
45 Caleb
48 Nathan
49 Thomas
50 Leo
61 Landon
63 Jonathan
64 Nolan
66 Easton
72 Angel
76 Jaxson
78 Adam
86 Evan
89 Jose
90 Jace
91 Jameson
94 Axel
100 Jason
101 Declan
102 Weston
106 Luca
112 Chase
114 Emmett
118 Cole
120 Bennett
128 Ashton
132 Gael
135 Maxwell
136 Max
139 Juan
140 Maddox
145 Jonah
146 Abel
148 Jesus
151 Beau
152 Camden
153 Alex
157 Jude
158 Blake
159 Emmanuel
170 August
172 Alan
173 Dean
185 Jesse
187 Joel
194 Dawson
196 Matteo
198 Steven
200 Zane
202 Judah
207 Kaleb
214 Jax
216 Holden
217 Legend
220 Kaden
221 Paxton
225 Josue
226 Kenneth
227 Beckett
228 Enzo
233 Lukas
234 Paul
237 Caden
238 Leon
243 Theo
246 Jaden
255 Ace
256 Nash
262 Jake
269 Sean
270 Chance
276 Cash
284 Stephen
287 Dallas
289 Manuel
290 Lane
291 Atlas
293 Jensen
295 Beckham
296 Daxton
304 Jett
305 Cohen
316 Dante
319 Kane
320 Luka
321 Kash
323 Desmond
324 Donovan
330 Angelo
345 Muhammad
346 Jaxton
349 Dakota
351 Keegan
355 Kade
357 Leonel
361 Wade
370 Jase
371 Lennox
372 Shane
376 Seth
379 Lawson
381 Gage
385 Cade
386 Johnathan
393 Shawn
394 Malcolm
397 Dalton
403 Kason
405 Noel
419 Leland
420 Pablo
421 Allen
427 Damon
428 Emanuel
431 Bowen
434 Kasen
437 Jonas
438 Sage
440 Esteban
442 Kashton
449 Adan
453 Dax
454 Mohamed
456 Kamden
457 Hank
460 Augustus
465 Benson
472 Alonzo
473 Landen
486 Deacon
488 Eden
495 Tate
499 Moses
506 Case
508 Asa
511 Aden
517 Apollo
526 Donald
528 Saul
531 Duke
533 Tatum
534 Ahmed
535 Moshe
538 Cannon
539 Alec
541 Keaton
547 Samson
550 Cason
551 Ahmad
552 Jalen
557 Callum
570 Callen
574 Kobe
577 Mathew
579 Johan
582 Stetson
588 Callan
589 Cullen
593 Kannon
595 Axton
603 Sam
605 Mohammad
607 Gustavo
612 Hamza
617 Kellan
619 Kase
625 Kohen
627 Mohammed
630 Lucca
632 Mack
638 Alden
642 Zeke
650 Lance
655 Amos
660 Casen
661 Colten
667 Devon
669 Boone
671 Nelson
672 Douglas
675 Lennon
679 Noe
682 Lochlan
685 Langston
686 Lachlan
688 Abdullah
689 Lee
692 Ben
695 Joe
699 Kellen
701 Jakob
708 Tomas
710 Thaddeus
711 Watson
714 Koda
716 Nathanael
732 Santana
735 Wells
741 Axl
745 Musa
747 Enoch
750 Talon
756 Dane
765 Hassan
766 Jamal
772 Kole
775 Alonso
777 Madden
778 Allan
780 Jaxen
782 Magnus
784 Dash
798 Jaxxon
809 Keanu
816 Koa
818 Coen
827 Van
829 Canaan
836 Maxton
837 Tadeo
839 Aldo
853 Blaze
855 Kace
862 Eugene
866 Nova
873 Kenzo
878 Stefan
879 Wallace
881 Kendall
885 Anson
886 Gannon
890 Dangelo
893 Bentlee
897 Chad
899 Mustafa
912 Wesson
913 Alfonso
916 Juelz
917 Duncan
918 Keagan
919 Deshawn
920 Bode
926 Keenan
928 Jaxx
936 Heath
939 Elon
943 Maddux
948 Vance
949 Boden
969 Jad
975 Zev
983 Deangelo
986 Kalel
998 Benton
999 Coleman