기말 프로젝트

# install.packages(c("readxl","ggplot2","dplyr"))

library(readxl)

library(ggplot2)

library(dplyr)

##프로젝트 내용: 1970~2000년대에 명곡으로 평가받고 많은 사람들에게 사랑받은 노래들의 가사를 분석하여, 사람들이 어떤 주제의 노래를 가장 좋아하는지 알아보고 시대에 따라 변화해 가는 주제 또한 분석해 보았습니다.

##자료를 모은 곳: 빌보드에서 정리한 70~00년대의 인기곡을 연대별로 10곡씩 excel 파일에 정리하였다. 가사는 구글 검색을 통해 나오는 글을 모아 txt 파일로 만들었다. 또한 function words가 정리된 파일을 찾지 못해서 일단 직접 만들어서 사용하였다.

#인기곡들을 뽑아온 것이기 때문에, 가사 외에도 인기곡이 되는 데 영향을 줄 수 있는 요소들(가수, 그 시대에 인기 있던 장르 등)을 정리하여 보기 쉽도록 하였습니다.

##1. 가수

# Load the song list and count how many chart entries each artist has.
# dplyr and ggplot2 are already attached at the top of the file.
songs <- read_excel("자료.xlsx")
frequency_singer <- table(songs$singer)
freq_singer <- as.data.frame(frequency_singer)
freq_singer <- rename(freq_singer, singer = Var1, freq = Freq)

# One bar per artist; the bar height is that artist's number of songs.
ggplot(freq_singer, aes(x = singer, y = freq)) +
  geom_col()

그래프로 그리면 X축의 항목이 너무 많아서 보기 어려우니 빈도순으로 정렬해서 보겠다.

# Show the five artists with the highest song counts.
head(arrange(freq_singer, desc(freq)), 5)

##          singer freq
## 1         ac/dc    2
## 2        eminem    2
## 3 fleetwood mac    2
## 4       50 cent    1
## 5          a-ha    1

ac/dc, eminem, fleetwood mac이 2번씩 등장한 것을 볼 수 있었다.

# Load the sheet that lists each artist's nationality.
nation <- read_excel("nationality.xlsx")

지도에서 각 나라별 가수의 수를 알아보고자 국적을 추가한 엑셀파일을 만들었다.

# Tabulate how many artists come from each country and print the table.
freq_contries <- nation$nationality
freq_na <- table(freq_contries)
freq_na

## freq_contries
## australia    canada   Ireland    norway        UK       USA 
##         3         1         1         1         7        27

# Turn the country table into a data frame (one row per country) and show it.
freq_countries <- as.data.frame(freq_na)
freq_countries

##   freq_contries Freq
## 1     australia    3
## 2        canada    1
## 3       Ireland    1
## 4        norway    1
## 5            UK    7
## 6           USA   27

# Give the columns readable names, then draw one bar per country.
# dplyr is already attached at the top of the file.
freq_countries <- rename(freq_countries, country = freq_contries, freq = Freq)
ggplot(freq_countries, aes(x = country, y = freq)) +
  geom_col()

nation의 빈도를 보기 위해 table을 만든 후 data frame으로 변경하여 그래프를 그렸다.

usa가 가장 많음을 볼 수 있었다.

##장르별 비교

# Load the genre sheet, count the songs per genre, and print the counts.
genre <- read_excel("genre.xlsx")
genres <- genre$genre
df_genres <- as.data.frame(table(genres))
df_genres

##            genres Freq
## 1     alternative    6
## 2  classical rock    2
## 3    country rock    2
## 4           disco    3
## 5       hard rock    5
## 6        new wave    2
## 7             pop    4
## 8        pop rock    3
## 9             r&b    2
## 10            rap    4
## 11           rock    7

# One bar per genre; the bar height is the number of songs in that genre.
ggplot(df_genres, aes(x = genres, y = Freq)) +
  geom_col()

2. 70년대에 인기 있던 가사의 단어 알아보기

# Reload the song list (readxl is already attached at the top of the file).
songs <- read_excel("자료.xlsx")

# Read the 1970s lyrics, one element per line of the text file.
lyrics_70s_raw <- readLines("70s_song.txt", encoding = "UTF-8")

## Warning in readLines("70s_song.txt", encoding = "UTF-8"): '70s_song.txt'에서
## 불완전한 마지막 행이 발견되었습니다

# Keep the raw lines under their own name and do the cleaning on a copy.
lyrics_70s <- lyrics_70s_raw

우선 가사가 정리된 파일은 복사본을 만들었다

# Load the list of function words to drop (one word per line).
extr <- readLines("extr.txt")

# Strip punctuation, then lower-case, so "Love," and "love" count as one word.
clean_lyrics_70s <- gsub("[[:punct:]]", "", lyrics_70s)
clean_lyrics_70s <- tolower(clean_lyrics_70s)

# strsplit() returns a list holding one character vector per lyric line.
words_70 <- strsplit(clean_lyrics_70s, "\\s+")

# Flatten to a single vector of words BEFORE filtering. Applying %in% to the
# list itself compares each whole line, not each word, against the stop words,
# so words such as "you" or "the" were never removed.
all_words_70 <- unlist(words_70)
stop_words_70 <- tolower(trimws(extr))
keep_70 <- nzchar(all_words_70) & !(all_words_70 %in% stop_words_70)
filtered_words_70 <- all_words_70[keep_70]

# Count the remaining words; the result has the columns `word` and `freq`.
wordcount <- table(word = filtered_words_70)
df_words_70s <- as.data.frame(
  wordcount,
  responseName = "freq",
  stringsAsFactors = FALSE
)

단어 정리 및 불러오기까지 하였다.

library(wordcloud)

## Warning: 패키지 'wordcloud'는 R 버전 4.4.2에서 작성되었습니다

## 필요한 패키지를 로딩중입니다: RColorBrewer

# Word cloud of the 1970s counts: words seen at least twice, at most 200
# words, not placed in random order, 10% of the words rotated.
wordcloud(
  words = df_words_70s$word,
  freq = df_words_70s$freq,
  min.freq = 2,
  max.words = 200,
  random.order = FALSE,
  rot.per = 0.1,
  scale = c(4, 0.3)
)

70년대의 워드 클라우드. 왜인지 txt 파일에 적어 둔 단어들이 삭제되지 않아서 you, and, the와 같은 기능어들이 많이 보이는 것을 알 수 있다.

하지만 빼고 봤을 때 love가 크게 보이는 것을 알 수 있다.

love는 come and get your love라는 노래에 워낙 많이 나오는 단어이므로, come and get your love를 제외하고 다시 찾아보기로 하였다.

# Read the 1970s lyrics file that leaves out "Come and Get Your Love".
wo_70s <- readLines("70s_song_wo_come_.txt")

## Warning in readLines("70s_song_wo_come_.txt"): '70s_song_wo_come_.txt'에서
## 불완전한 마지막 행이 발견되었습니다

# Strip punctuation, then lower-case THE STRIPPED TEXT. The original
# lower-cased the raw `wo_70s` again, which threw the punctuation removal away.
clean_lyrics_70s_wo <- gsub("[[:punct:]]", "", wo_70s)
clean_lyrics_70s_wo <- tolower(clean_lyrics_70s_wo)

# strsplit() returns a list holding one character vector per lyric line.
words_70_wo <- strsplit(clean_lyrics_70s_wo, "\\s+")

# Flatten before filtering: %in% applied to the list compares each whole
# line, not each word, so no stop word was removed.
all_words_70_wo <- unlist(words_70_wo)
stop_words_70_wo <- tolower(trimws(extr))
keep_70_wo <- nzchar(all_words_70_wo) &
  !(all_words_70_wo %in% stop_words_70_wo)
filtered_words_70_wo <- all_words_70_wo[keep_70_wo]

# Count the remaining words; the result has the columns `word` and `freq`.
wordcount_wo <- table(word = filtered_words_70_wo)
df_words_70s_wo <- as.data.frame(
  wordcount_wo,
  responseName = "freq",
  stringsAsFactors = FALSE
)

# Word cloud for the 1970s lyrics without "Come and Get Your Love"; same
# settings as the other clouds in this report.
wordcloud(
  words = df_words_70s_wo$word,
  freq = df_words_70s_wo$freq,
  min.freq = 2,
  max.words = 200,
  random.order = FALSE,
  rot.per = 0.1,
  scale = c(4, 0.3)
)

그래도 love가 꽤나 큰 부분을 차지 하는 것을 알 수 있었다.

##80, 90년대: 각 연대의 가사 분위기를 비교하기 위해 위의 과정을 반복하였다.

# Read the 1980s lyrics, one element per line of the text file.
lyrics_80s <- readLines("80s_song.txt")

## Warning in readLines("80s_song.txt"): '80s_song.txt'에서 불완전한 마지막 행이
## 발견되었습니다

# Read the 1990s lyrics, one element per line of the text file.
lyrics_90s <- readLines("90s_song.txt")

## Warning in readLines("90s_song.txt"): '90s_song.txt'에서 불완전한 마지막 행이
## 발견되었습니다

# 1980s: remove punctuation, lower-case, then split each line on whitespace.
clean_lyrics_80s <- tolower(gsub("[[:punct:]]", "", lyrics_80s))
words_80 <- strsplit(clean_lyrics_80s, split = "\\s+")

# 1990s: the same three steps.
clean_lyrics_90s <- tolower(gsub("[[:punct:]]", "", lyrics_90s))
words_90 <- strsplit(clean_lyrics_90s, split = "\\s+")

이 과정까지는 동일하게 하였고, na가 들어가는 가사가 많았기 때문에 이를 제거하기 위하여 extr 파일에 na나 do를 추가한 extr2 파일을 만들었다. 또한 파일로 제거가 안 될 경우를 대비하여 stop_words 묶음도 만들었다.

# Load the extended list of words to drop (one word per line).
extr2 <- readLines("extract2.txt")

## Warning in readLines("extract2.txt"): 'extract2.txt'에서 불완전한 마지막 행이
## 발견되었습니다

# In-code stop-word list used for the 1980s lyrics.
stop_words <- c(
  "a", "an", "the", "you", "i", "they", "them", "their", "for", "in",
  "an", "a", "as", "na", "yes", "am", "are", "were", "was"
)

# `words_80` / `words_90` are lists (one character vector per lyric line).
# Flatten them before filtering: %in% applied to the list compares each whole
# line, not each word, so no stop word was removed.
all_words_80 <- unlist(words_80)
keep_80 <- nzchar(all_words_80) & !(all_words_80 %in% stop_words)
filtered_words_80 <- all_words_80[keep_80]

# The 1990s lyrics are filtered with the file-based list instead.
all_words_90 <- unlist(words_90)
stop_words_90 <- tolower(trimws(extr2))
keep_90 <- nzchar(all_words_90) & !(all_words_90 %in% stop_words_90)
filtered_words_90 <- all_words_90[keep_90]

# Count the filtered 1980s words and store them as a word/freq data frame.
wordcount <- table(unlist(filtered_words_80))
df_words_80s <- rename(
  as.data.frame(wordcount, stringsAsFactors = FALSE),
  word = Var1,
  freq = Freq
)

# Word cloud of the 1980s counts; same settings as the other clouds.
wordcloud(
  words = df_words_80s$word,
  freq = df_words_80s$freq,
  min.freq = 2,
  max.words = 200,
  random.order = FALSE,
  rot.per = 0.1,
  scale = c(4, 0.3)
)

# Count the filtered 1990s words and store them as a word/freq data frame.
wordcount <- table(unlist(filtered_words_90))
df_words_90s <- rename(
  as.data.frame(wordcount, stringsAsFactors = FALSE),
  word = Var1,
  freq = Freq
)

# Word cloud of the 1990s counts; same settings as the other clouds.
wordcloud(
  words = df_words_90s$word,
  freq = df_words_90s$freq,
  min.freq = 2,
  max.words = 200,
  random.order = FALSE,
  rot.per = 0.1,
  scale = c(4, 0.3)
)

#00년대

# Read the 2000s lyrics, one element per line of the text file.
lyrics_00s <- readLines("00s.txt")

## Warning in readLines("00s.txt"): '00s.txt'에서 불완전한 마지막 행이
## 발견되었습니다

# 2000s: remove punctuation, lower-case, then split each line on whitespace.
clean_lyric_00s <- tolower(gsub("[[:punct:]]", "", lyrics_00s))
words_00 <- strsplit(clean_lyric_00s, split = "\\s+")

70, 80, 90년대와 동일한 방법으로 진행하였다.

# Stop-word list for the 2000s lyrics; adds "yeah" and "without" to the
# words used for the 1980s.
stop_words2 <- c(
  "a", "an", "the", "you", "i", "they", "them", "their", "for", "in",
  "an", "a", "as", "na", "yes", "am", "are", "were", "was",
  "yeah", "yes", "without"
)

# `words_00` is a list (one character vector per lyric line). Flatten it
# before filtering: %in% applied to the list compares each whole line, not
# each word, so no stop word was removed.
all_words_00 <- unlist(words_00)
keep_00 <- nzchar(all_words_00) & !(all_words_00 %in% stop_words2)
filtered_words_00 <- all_words_00[keep_00]

00년대 음악은 어셔의 노래와 같이 yeah가 계속 반복되기 때문에 결과값을 방해하지 않도록 stop words에 포함시켜 제거하였다.

# Count the filtered 2000s words and store them as a word/freq data frame.
wordcount <- table(unlist(filtered_words_00))
df_words_00s <- rename(
  as.data.frame(wordcount, stringsAsFactors = FALSE),
  word = Var1,
  freq = Freq
)

# Word cloud of the 2000s counts; same settings as the other clouds.
wordcloud(
  words = df_words_00s$word,
  freq = df_words_00s$freq,
  min.freq = 2,
  max.words = 200,
  random.order = FALSE,
  rot.per = 0.1,
  scale = c(4, 0.3)
)

워드 클라우드와는 달리 수치로 조금 더 자세히 확인해보기로 하였다.

그래프로 그리기엔 단어양이 너무 많기 때문에 가장 많이 쓰인 단어를 50개 정도만 추려서 그래프를 그려보도록 하겠다

library(plotly)

## Warning: 패키지 'plotly'는 R 버전 4.4.2에서 작성되었습니다

## 
## 다음의 패키지를 부착합니다: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

# Add each word's share of all counted 1970s words, then keep the 50 words
# with the largest share.
df_words_70s <- df_words_70s %>%
  mutate(`prop.table(freq)` = prop.table(freq)) %>%
  arrange(desc(`prop.table(freq)`)) %>%
  head(50)

# Scatter plot of share per word, converted to an interactive plotly widget.
dw7 <- ggplot(df_words_70s, aes(x = word, y = `prop.table(freq)`)) +
  geom_point()
ggplotly(dw7)

이런 식으로 하면 70년대에 자주 쓰인 단어 50개를 뽑아 그래프를 그릴 수 있다. x축에는 단어를, y축에는 빈도 비율을 나타내었다. 단어가 너무 많아서 잘 보이지 않는 점을 보완하기 위하여 interactive 그래프로 만들었다.

##80년대

# Add each word's share of all counted 1980s words, then keep the 50 words
# with the largest share.
df_words_80s <- df_words_80s %>%
  mutate(`prop.table(freq)` = prop.table(freq)) %>%
  arrange(desc(`prop.table(freq)`)) %>%
  head(50)

# Scatter plot of share per word, converted to an interactive plotly widget.
dw8 <- ggplot(df_words_80s, aes(x = word, y = `prop.table(freq)`)) +
  geom_point()
ggplotly(dw8)

##90년대

# Add each word's share of all counted 1990s words, then keep the 50 words
# with the largest share.
df_words_90s <- df_words_90s %>%
  mutate(`prop.table(freq)` = prop.table(freq)) %>%
  arrange(desc(`prop.table(freq)`)) %>%
  head(50)

# Store the plot as `dw9`, matching dw8 / dw0 for the other decades. The
# original reused `dw7`, which overwrote the 1970s plot object.
dw9 <- ggplot(df_words_90s, aes(x = word, y = `prop.table(freq)`)) +
  geom_point()
ggplotly(dw9)

##00년대

# Add each word's share of all counted 2000s words, then keep the 50 words
# with the largest share.
df_words_00s <- df_words_00s %>%
  mutate(`prop.table(freq)` = prop.table(freq)) %>%
  arrange(desc(`prop.table(freq)`)) %>%
  head(50)

# Scatter plot of share per word, converted to an interactive plotly widget.
dw0 <- ggplot(df_words_00s, aes(x = word, y = `prop.table(freq)`)) +
  geom_point()
ggplotly(dw0)

결론과 더 알아보고 싶은 점

결론: 우선 워드 클라우드상으로 봤을 때는 70, 80, 90년대 노래의 중심이 되었던 love라는 단어가 점차 사라진다는 점이 두드러졌다. 이러한 양상은 노래의 주제가 시대에 따라 점차 변한다는 점을 보여주었다.
더 알아보고 싶은 점: 수치로 나타내 보았을 때, 70년대에는 you라는 대명사의 비율이 굉장히 높았는데 80년대부터 줄기 시작하였고, 오히려 90년대부터는 I가 굉장히 많이 사용되었다는 점이 개인적으로 흥미로웠다. 이러한 대명사 사용의 변화가 개인주의의 확산과도 연관이 있을지 궁금해졌다.

기말 프로젝트

20230089윤현정

2024-12-10

2.70s에서 인기있던 가사의 단어 알아보기

결론과 더 알아보고 싶은 점