신문기사 군집분석 실습

저번 글에 이어서 파이썬 군집분석을 해보겠다.

사실 이번 데이터세트는 큰 의미는 없다

그 이유는 마지막에 설명하겠다.

import pickle
import re
import string
import sys
from datetime import datetime

import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

import gensim
import nltk
import numpy as np
import pandas as pd
import scipy as sp
import sklearn
from gensim.parsing.preprocessing import STOPWORDS
from gensim.utils import simple_preprocess
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from scipy import linalg
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.preprocessing import normalize

갖가지 파이썬 패키지들을 import 해준다

# Load the two news CSV files; each read keeps six columns selected by position.
contact_tracing_csv = r"C:\Users\kingn\naver_news_contacttracing.csv"
patient_spread_csv = r"C:\Users\kingn\naver_news_patientspread.csv"

df_c = pd.read_csv(contact_tracing_csv, usecols=[2, 3, 4, 5, 6, 7])
df_p = pd.read_csv(patient_spread_csv, usecols=[1, 2, 3, 4, 5, 6])

윈도우를 쓰는 사람들은 "경로 복사"를 클릭하면 파일 경로를 그대로 복사해 올 수 있다.

그런데 경로 복사로 가져온 경로를 그대로 문자열에 넣으면, 역슬래시(\)가 이스케이프 문자로 해석되어(예: \U는 유니코드 이스케이프로 취급된다) 에러가 뜬다

이럴 때는 문자열의 여는 따옴표 바로 앞에 r 을 붙여주면 된다

r 은 reverse 가 아니라 raw(원시 문자열)의 약자로, 역슬래시를 이스케이프로 해석하지 않고 문자 그대로 다루게 해준다. 나도 정말 많이 실수한 부분이다

아니면 경로의 역슬래시(\)를 이중 역슬래시(\\)나 슬래시(/)로 바꿔줘도 된다

이게 가끔 화딱지가 나는 이유는, 파일을 불러오지 못하면 그다음 작업을 아무것도 수행할 수 없기 때문이다

# Preview the first rows of df_c to check that the file loaded as expected.
df_c.head()

데이터 상태를 확인해준다

# Preview the first rows of df_p to check that the file loaded as expected.
df_p.head()

기사를 2개의 데이터세트에서 가져 왔으므로 합쳐준다.

# Combine df_c and df_p by stacking df_p's rows below df_c's (axis=0).
# ignore_index=True already renumbers the result 0..n-1, so a separate
# reset_index(drop=True) call is unnecessary.
df = pd.concat([df_c, df_p], axis=0, ignore_index=True)

# Preview the last rows of the combined frame (these come from df_p).
df.tail()

# List the distinct values found in the outlet column, without duplicates.
df["outlet"].unique()

# Count how many rows each outlet value has.
df.outlet.value_counts()

논조가 비슷한 신문사끼리 묶어서 나눠준다

# Regex alternations naming the outlets that belong to each group.
conservative_outlets = "조선일보|중앙일보|동아일보"
liberal_outlets = "한겨레|경향신문"

# Boolean masks: True where the outlet value matches one of the group's names.
is_conservative = df["outlet"].str.contains(conservative_outlets)
is_liberal = df["outlet"].str.contains(liberal_outlets)

conservative = df[is_conservative]
liberal = df[is_liberal]

# Show how many rows fell into each group.
len(conservative), len(liberal)

# Report how many rows have a missing (null) value in the text column.
missing_text_count = int(df['text'].isna().sum())
print(missing_text_count)

# Display the rows whose text is missing.
df[df['text'].isnull()]

# Keep only the rows that actually have article text.
# .copy() makes df an independent frame rather than a filtered slice of the
# old one, so the column assignments that follow (df['text_clean'] = ...) do
# not trigger pandas' SettingWithCopyWarning.
df = df[df['text'].notna()].copy()

# Remove unneeded punctuation, special characters, URLs and e-mail addresses.
def clean_text(text):
    """Return *text* with URLs, e-mails, bylines and noise characters removed.

    The substitutions run in the order written below; each one replaces its
    matches with the empty string.

    Args:
        text: Raw article body (``str``).

    Returns:
        The cleaned string.
    """
    # All patterns are raw strings so that regex escapes such as \S are not
    # treated as (invalid) Python string escapes.
    text = re.sub(r'http[s]?://\S+', '', text)  # http(s) URLs
    text = re.sub(r'\S*@\S*\s?', '', text)  # reporter e-mail addresses
    text = re.sub(r'\[.*?\]', '', text)  # bracketed text: outlet name + reporter name
    text = re.sub(r'\w*\d\w*', '', text)  # any token that contains a digit
    # Special characters and emoticon pieces. The original class contained
    # "&-=", which the regex engine reads as a RANGE from '&' to '=' and which
    # therefore also stripped ' ( / = and the digits. Those characters are
    # listed explicitly here so the removed set is unchanged but visible.
    text = re.sub(r"[?.,;:|)*~`’!^\-_+<>@#$%&'(/0-9=}※]", '', text)
    text = re.sub('\n', '', text)  # newlines
    text = re.sub('\xa0', '', text)  # non-breaking spaces
    text = re.sub(r'Copyright .* rights reserved', '', text)  # copyright footer
    return text

# Store the cleaned version of every article in a new column.
# Series.apply already returns a Series aligned on df's index, so it can be
# assigned directly; no DataFrame wrapper or pass-through lambda is needed.
df['text_clean'] = df['text'].apply(clean_text)

# Preview the frame to confirm the text_clean column was added.
df.head()

# Display the cleaned text column.
df['text_clean']

# Run clean_text over each article body and keep the result in cleaned_text,
# the column the tokenizing steps below read from.
df['cleaned_text'] = df['text'].map(clean_text)

from konlpy.tag import Okt

# One shared Okt tagger instance, reused by the tokenizing steps.
okt = Okt()

# Run okt.morphs on every cleaned article; the result is one list per article.
text = list(map(okt.morphs, df['cleaned_text']))

# Run okt.nouns on every cleaned article; the result is one list per article.
text_noun = list(map(okt.nouns, df['cleaned_text']))

import json

# Load the Korean stopword list and extend it with extra words to drop.
# NOTE: this rebinds the name `stopwords`, shadowing the nltk.corpus import
# of the same name made at the top of the file.
with open('stopwords-ko.json', encoding='UTF8') as f:
    stopwords = json.load(f)
stopwords.extend(['에서', '으로', '이다', '는', '한', '은', '씨', 'A', '서', '다', '했다', '하는', 'ㆍ'])

# Membership tests against a list scan the whole list for every token;
# a set gives constant-time lookups. `stopwords` itself stays a list.
stopword_set = set(stopwords)

# Filter the noun lists once and store them in a single assignment. Wrapping
# in a Series keyed on df.index keeps each row's value a plain list.
df['text_tokenized'] = pd.Series(
    [[noun for noun in nouns if noun not in stopword_set] for nouns in text_noun],
    index=df.index,
)
df.head()

TF-IDF

# Token lists for every document (one list of nouns per article).
data1 = df['text_tokenized'].tolist()
# NOTE: data1 was captured above, so this truncation only affects later uses
# of df; the TF-IDF input below still contains every document.
df = df[:500]

# TfidfVectorizer expects one string per document, so join each token list
# with spaces. (The loop variable is no longer called `string`, which used to
# shadow the imported `string` module.)
data = [' '.join(tokens) for tokens in data1]
print(data[2])  # preview one joined document

from collections import defaultdict  # kept: imported by the original cell

vectorizer = TfidfVectorizer()
sp_matrix = vectorizer.fit_transform(data)

# Map every learned feature to its column index in sp_matrix.
# A plain dict is used on purpose: the previous defaultdict(lambda: 0) sent
# every token that is NOT in the vocabulary to column 0, so such tokens
# silently reported the TF-IDF score of an unrelated feature.
word2id = {feature: idx for idx, feature in enumerate(vectorizer.get_feature_names_out())}

tfidf = []
for doc_idx, sent in enumerate(data):
    scored = []
    for token in sent.split():
        # With default settings TfidfVectorizer lowercases its input and keeps
        # only tokens of two or more word characters, so look up the
        # lowercased token and score anything outside the vocabulary as 0.0.
        column = word2id.get(token.lower())
        score = sp_matrix[doc_idx, column] if column is not None else 0.0
        scored.append((token, score))
    print('===== document[%d] =====' % doc_idx)
    print(scored)
    tfidf.append(scored)

for scored in tfidf:
    # Drop repeated (token, score) pairs while keeping first-seen order, so
    # the printed table is the same on every run.
    unique_pairs = list(dict.fromkeys(scored))
    show = pd.DataFrame(unique_pairs, columns=["token", "vector"])
    show = show.sort_values(by=["vector"], ascending=False)
    print(show.head(10))

### Topic modeling - LDA
# How do topic modeling and clustering differ?
#   topic modeling: inference based on occurrence probability (from term frequency)
#   clustering: a geometric approach based on embedding vectors
from gensim import corpora, models

# Build the token <-> id dictionary and the bag-of-words corpus.
dictionary = corpora.Dictionary(data1)
corpus = [dictionary.doc2bow(doc_tokens) for doc_tokens in data1]
print(corpus[0:1])  # preview the first document's bag-of-words

ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=2,  # num_topics controls how many topics are learned
    id2word=dictionary,
    random_state=42,  # fixed seed so repeated runs give the same topics
)

# print_topics() only returns (and logs) the topics, so print the result to
# make it visible even when this runs as a single cell or script.
print(ldamodel.print_topics(num_words=10))

# Topic distribution of the last document in the corpus.
ldamodel.get_document_topics(corpus)[-1]

앞서 이번 데이터세트에 큰 의미가 없다고 한 이유는, 이 기사들이 교육 목적으로 준비된 것이기 때문이다.

전염병 관련된 기사들을 가지고 2개로 나눠서 파이썬으로 분석했다.

기사 신문사별로 어떤 차이가 있을까?

과거 조국사태, 청와대 개방, 용산 이전

이런 주제로 크롤링해서 보면 아마 신문사별로 논조가 달라서

재미있는 주제가 나오지 않을까 싶다.