본문 바로가기
Python/ETC

[Python] 파이썬 간단한 워드클라우드 생성 :: 마이자몽

by 🌻♚ 2019. 1. 3.

네이버 뉴스 워드클라우드

μ΄μ „κΈ€μ—μ„œ μž‘μ„±ν–ˆλ˜ 넀이버 λ‰΄μŠ€ 제λͺ©ν¬λ‘€λ§ν•œ λ‚΄μš©μ„ μ΄μš©ν•΄μ„œ ν˜•νƒœμ†ŒλΆ„μ„μ„ ν•˜μ—¬ κ°„λ‹¨ν•œ μ›Œλ“œ ν΄λΌμš°λ“œλ₯Ό λ§Œλ“€λ €κ³ ν•œλ‹€.
 
주요 모듈
-BeautifulSoup : 크롤링

-requests : url get request

-Twitter : 형태소 분석

-Counter : 형태소 count

-WordCloud : 워드클라우드 생성

-matplotlib : 워드클라우드 출력

 

from bs4 import BeautifulSoup
import requests
from konlpy.tag import Twitter
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Search keyword sent to Naver news search ("삼성" = Samsung).
search_word = "삼성"
# Module-level accumulator: get_titles() appends headlines to it and
# make_wordcloud() reads them back.
title_list = []

def get_titles(start_num, end_num):
    """Collect Naver news-search headlines into the global ``title_list``.

    Requests the result pages for ``search_word`` at offsets ``start_num``,
    ``start_num + 10``, ... up to and including ``end_num``, and appends every
    matched headline to ``title_list``. Pages that answer with a non-OK HTTP
    status are skipped. The accumulated list is printed once at the end.

    Args:
        start_num: Value of the ``start`` query parameter for the first page.
        end_num: Largest ``start`` value that is still requested.

    Raises:
        requests.RequestException: If a request fails or times out.
    """
    base_url = 'https://search.naver.com/search.naver'

    for start in range(start_num, end_num + 1, 10):
        print(start)

        # Passing the query through ``params`` lets requests percent-encode
        # it, so keywords containing spaces, '&' or '#' cannot corrupt the URL.
        params = {
            'where': 'news',
            'sm': 'tab_jum',
            'query': search_word,
            'start': start,
        }
        # A timeout keeps a single stalled connection from hanging the crawl.
        req = requests.get(base_url, params=params, timeout=10)

        # Skip pages that did not come back with a successful status.
        if not req.ok:
            continue

        soup = BeautifulSoup(req.text, 'html.parser')

        # Headline anchors of the news result list.
        for anchor in soup.select('ul.type01 > li > dl > dt > a'):
            # Prefer the ``title`` attribute; fall back to the visible link
            # text instead of raising KeyError when the attribute is absent.
            headline = anchor.get('title') or anchor.get_text(strip=True)
            if headline:
                title_list.append(headline)

    print(title_list)

def make_wordcloud(word_count, font_path='/Library/Fonts/NanumSquareLight.ttf'):
    """Show a word cloud of the most frequent nouns/adjectives in ``title_list``.

    Every title in the global ``title_list`` is POS-tagged, the words tagged
    ``Noun`` or ``Adjective`` are counted, and the ``word_count`` most common
    ones are drawn with matplotlib. Intermediate results are printed.

    Args:
        word_count: Maximum number of distinct words to draw.
        font_path: Font file handed to ``WordCloud``. A font with Hangul
            glyphs is needed so Korean text is not rendered as broken
            characters. The default is the path the script originally
            hard-coded.
    """
    twitter = Twitter()

    # POS-tag every collected title.
    sentences_tag = []
    for sentence in title_list:
        morph = twitter.pos(sentence)
        sentences_tag.append(morph)
        print(morph)
        print('-' * 30)

    print(sentences_tag)
    print('\n' * 3)

    # Keep only nouns and adjectives.
    noun_adj_list = [
        word
        for tagged_sentence in sentences_tag
        for word, tag in tagged_sentence
        if tag in ('Noun', 'Adjective')
    ]

    # Frequency of each kept word, limited to the top ``word_count``.
    tags = Counter(noun_adj_list).most_common(word_count)
    print(tags)

    frequencies = dict(tags)
    print(frequencies)

    # WordCloud.generate_from_frequencies raises ValueError on an empty
    # mapping, so stop early when nothing was collected.
    if not frequencies:
        print('No nouns or adjectives found; skipping word cloud.')
        return

    wc = WordCloud(font_path=font_path, background_color='white', width=800, height=600)
    cloud = wc.generate_from_frequencies(frequencies)

    plt.figure(figsize=(10, 8))
    plt.axis('off')
    plt.imshow(cloud)
    plt.show()

if __name__ == '__main__':
    # Crawl the result pages for offsets 1 through 200.
    get_titles(1, 200)

    # Draw up to the 30 most frequent words as a word cloud.
    make_wordcloud(30)

 

 

 

 

 

 

워드클라우드 출력

 

 

댓글