네이버 뉴스 워드클라우드
이전글에서 작성했던 네이버 뉴스 제목크롤링한 내용을 이용해서 형태소분석을 하여 간단한 워드 클라우드를 만들려고 한다.
주요 모듈
-BeautifulSoup : 크롤링
-requests : url get request
-Twitter : 형태소 분석
-Counter : 형태소 count
-WordCloud : 워드클라우드 생성
-matplotlib : 워드클라우드 출력
from bs4 import BeautifulSoup
import requests
from konlpy.tag import Twitter
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Keyword sent to the Naver news search ("삼성" = Samsung). The literal was
# restored from mis-encoded text so the query matches the intended keyword.
search_word = "삼성"
# Headlines accumulated by get_titles() and consumed by make_wordcloud().
title_list = []
def get_titles(start_num: int, end_num: int) -> None:
    """Collect Naver news headlines for ``search_word`` into ``title_list``.

    Requests the Naver news search page once for every ``start`` offset from
    ``start_num`` up to and including ``end_num``, stepping by 10, and appends
    the ``title`` attribute of each headline anchor to the module-level
    ``title_list``. Each offset is printed before it is fetched, and the
    accumulated list is printed once at the end.

    Args:
        start_num: First value of the ``start`` query parameter.
        end_num: Largest ``start`` value to request (inclusive).

    Raises:
        requests.exceptions.RequestException: If a request fails or exceeds
            the timeout.
    """
    for offset in range(start_num, end_num + 1, 10):
        print(offset)
        # Pass the query as params so requests URL-encodes the search word
        # (a keyword containing '&', '#' or spaces would otherwise corrupt
        # the query string), and set a timeout so a stalled connection
        # cannot hang the crawl forever.
        req = requests.get(
            'https://search.naver.com/search.naver',
            params={
                'where': 'news',
                'sm': 'tab_jum',
                'query': search_word,
                'start': offset,
            },
            timeout=10,
        )
        # Skip pages that did not come back with a successful status.
        if not req.ok:
            continue
        soup = BeautifulSoup(req.text, 'html.parser')
        # Headline anchors of the news result list.
        anchors = soup.select('ul.type01 > li > dl > dt > a')
        # Anchors without a title attribute are skipped instead of raising
        # KeyError.
        title_list.extend(
            anchor['title'] for anchor in anchors if anchor.has_attr('title')
        )
    print(title_list)
def make_wordcloud(word_count, font_path='/Library/Fonts/NanumSquareLight.ttf'):
    """Render a word cloud of the most frequent nouns/adjectives in ``title_list``.

    Every headline in the module-level ``title_list`` is POS-tagged with
    konlpy's ``Twitter`` tagger; words tagged ``Noun`` or ``Adjective`` are
    counted, and the ``word_count`` most common ones are drawn with
    ``WordCloud`` and shown in a matplotlib window. Intermediate results
    (per-sentence tags, all tags, the top counts and their dict form) are
    printed along the way.

    Args:
        word_count: Maximum number of distinct words to include in the cloud.
        font_path: Path to a TrueType font used to render the words. A font
            with Hangul glyphs is needed so Korean text is not drawn as
            broken characters. Defaults to the path the script originally
            hard-coded.
    """
    twitter = Twitter()
    sentences_tag = []
    # POS-tag each headline and keep the tagged result.
    for sentence in title_list:
        morph = twitter.pos(sentence)
        sentences_tag.append(morph)
        print(morph)
        print('-' * 30)
    print(sentences_tag)
    print('\n' * 3)

    # Keep only nouns and adjectives.
    noun_adj_list = [
        word
        for tagged_sentence in sentences_tag
        for word, tag in tagged_sentence
        if tag in ('Noun', 'Adjective')
    ]

    # Count occurrences per word and keep the most frequent ones.
    counts = Counter(noun_adj_list)
    tags = counts.most_common(word_count)
    print(tags)

    # Build the word cloud; font_path is passed so Hangul renders correctly.
    wc = WordCloud(font_path=font_path, background_color='white', width=800, height=600)
    frequencies = dict(tags)
    print(frequencies)
    cloud = wc.generate_from_frequencies(frequencies)

    plt.figure(figsize=(10, 8))
    plt.axis('off')
    plt.imshow(cloud)
    plt.show()
if __name__ == '__main__':
    # Crawl search results for start offsets 1 through 200.
    get_titles(1, 200)
    # Draw a word cloud of up to the 30 most frequent words.
    # (A stray non-ASCII character after this call, which made the line a
    # syntax error, has been removed.)
    make_wordcloud(30)
워드클라우드 출력
댓글