A South America trip that somehow got enormously long!
Since I couldn't keep it straight even in my own head, I sketched out the whole itinerary.
Hope I wrap up this semester(?) well and have a great trip ><!
Carmel Beach at sunset, seen on January 12, 2020.
A moment when the passage of time suddenly becomes vivid!
LDA Topic Model output (20 Topics):
Topics #13 and #17 can be interpreted as “armed provocation” and “nuclear provocation”, respectively; topics #14 and #18 as “South–North dialogue” and “international talks”.
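These interpretations come from reading each topic's highest-weight words. A minimal sketch of how to inspect them with gensim, assuming lda_model is the trained LdaModel from the code below:

# Top ten words of the "armed provocation" topic (#13), with their weights
for word, prob in lda_model.show_topic(13, topn=10):
    print('%s : %.4f' % (word, prob))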
The number of articles in each category is shown in the graph below.
Independent variable: the public's support for unification, inverse-coded (1: necessary / 5: unnecessary).
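The survey-processing code is not part of this post; the sketch below only illustrates the coding of the variable, using a hypothetical survey DataFrame:

import pandas as pd

# Hypothetical survey waves: the 5-point item is already inverse-coded
# (1 = unification necessary ... 5 = unnecessary), so higher values
# mean weaker public support for unification.
survey = pd.DataFrame({'year': [2007, 2007, 2008, 2008],
                       'response': [1, 3, 5, 2]})
iv = survey.groupby('year')['response'].mean()  # yearly independent variable
print(iv)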
CODE:
import re
import os
import datetime as dt
import logging
import warnings
from operator import itemgetter

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gensim
import gensim.corpora as corpora
from konlpy.tag import Twitter

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Load every article: each entry is [newspaper (directory name), raw text]
data = []
dirname = 'D://nk//data'
k = 0
for f_dir in os.listdir(dirname):
    for fname in os.listdir(os.path.join(dirname, f_dir)):
        k += 1
        if k % 10000 == 0:
            print(k)
        with open(os.path.join(dirname, f_dir, fname), 'r', encoding='utf-8') as f:
            data.append([f_dir, f.read()])

# Document-specific preprocessing: extract the publication date with a
# cascade of regex patterns, most specific first
a1 = re.compile('등록\s*\:\s*(\d{4}\s*\-\s*\d{2}\s*\-\s*\d{2})')
a2 = re.compile('입력\s*(\d{4}\s*[\.\-]\s*\d{2}\s*[\.\-]\s*\d{2})')
a2_2 = re.compile('(\d{4}\s*[\.\-]\s*\d{2}\s*[\.\-]\s*\d{2})')
a2_3 = re.compile('등록\s*\:(\d{4}\-\d{2}\-\d{2})')  # a stray '\(' broke this group in the original
a3 = re.compile('((?:19|20)\d{6})')
a4 = re.compile('(\d{4}\s*\-\s*\d{2}\s*\-\s*\d{2})')

data_date = []  # [newspaper, YYYYMMDD date, raw text]; was used uninitialized in the original
for item in data:
    for a in [a1, a2, a2_2, a2_3, a4, a3]:
        if a.search(item[1]):
            date = re.sub('[\s\-\.]', '', a.search(item[1])[1])
            data_date.append([item[0], date, item[1]])
            if len(str(date)) != 8:
                print(item[1][:100])
            break

twitter = Twitter()

def sent_to_words(sentences):
    # Morphological tokenization for Korean text
    return twitter.morphs(sentences)

def remove_stopwords(texts):
    return [[word for word in preprocess(str(doc)) if word not in stop_words]
            for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def preprocess(doc):
    # Collapse whitespace and strip Latin characters and alphanumeric codes
    doc = re.sub('\s+', ' ', doc)
    doc = re.sub('[A-Za-z]+[0-9]+', '', doc)
    doc = re.sub('[a-zA-Z]+', ' ', doc)
    doc = re.sub('\s+', ' ', doc)
    return doc

# Tokenize each article; keep its (newspaper, date) pair as a header
header = [[item[0], item[1]] for item in data_date]
data = [sent_to_words(preprocess(item[2])) for item in data_date]

# Remove all one-character words except single-character country/actor names
country_list = ['미', '북', '러', '중', '일', '한', '군', '핵', '당', '말', '남']
data = [[re.sub('[^가-힣\s\_]', '', word) for word in item] for item in data]
data = [[word for word in item if (len(word) > 1) or (word in country_list)]
        for item in data]
data_words = [[h[0], h[1], words] for h, words in zip(header, data)]

# Remove document-specific stopwords (newspaper names, boilerplate, particles)
stop_words = ['아티클', '중앙일보', '조선일보', '동아일보', '한겨레', '구독', '관련기사',
              '아티', '클관련', '추가', '지면보기', '종합', '뉴스', '사진', '밝혔', '이라고',
              '등록', '라고', '라며', '내용', '보다', '경우', '지역', '위해', '이라는',
              '그런', '처럼', '이나', '같은', '는데', '다면', '그것', '이제', '때문',
              '다시', '많은', '정도', '일이', '없었', '되었', '인가', '않는',
              '베스트추천', '기자', '수정']
data_words = [[item[0], item[1], [word for word in item[2] if word not in stop_words]]
              for item in data_words]

def get_topic(txt):
    # Return the single highest-probability topic for one document
    corpus = id2word.doc2bow(txt)
    topic = list(lda_model.get_document_topics(corpus))
    return sorted(topic, key=itemgetter(1))[-1][0]

# Print the topics with their corresponding words and weights
lda_model.print_topics()

# Drawing graphs: tag every article with its dominant topic
result = [[item[0], item[1], get_topic(item[2])] for item in data_words]
df = pd.DataFrame(result)
df.columns = ['news', 'date', 'topic']
df.date = df.date.astype(int)
df = df[df.date >= 19950101]  # dates are YYYYMMDD integers; keep 1995 onward

# Remove errors in the date information
for i in range(len(df)):
    if int(str(df['date'].iloc[i])[4:6]) > 12 or int(str(df['date'].iloc[i])[4:6]) == 0:
        print(i, df['date'].iloc[i])
for i in range(len(df)):
    try:
        x = df['date'].iloc[i]
        dt.date(int(str(x)[0:4]), int(str(x)[4:6]), int(str(x)[6:]))
    except ValueError:
        print(i, df['date'].iloc[i])
# Rows flagged as invalid by the checks above
df.drop(df.index[[42981, 43438]], inplace=True)
df.drop(df.index[[65986, 74283]], inplace=True)

# Process the date information
df['date'] = df['date'].apply(lambda x: dt.date(int(str(x)[0:4]), int(str(x)[4:6]), int(str(x)[6:])))
df['date'] = pd.to_datetime(df['date'])

# Adjust the sample ratio by weighting the counts
# (earlier years were sampled more sparsely: x2 before 2005, x4/3 for 2005 to Oct 2009)
# day
count_d = df.groupby(['date', 'topic']).size().reset_index(name='count')
count_d['adj'] = count_d['count']
mask1 = (count_d['date'] < dt.date(2005, 1, 1))
mask2 = (dt.date(2005, 1, 1) < count_d['date']) & (count_d['date'] < dt.date(2009, 10, 17))
count_d.loc[mask1, 'adj'] = count_d.loc[mask1, 'count'] * 2
count_d.loc[mask2, 'adj'] = count_d.loc[mask2, 'count'] * (4/3)

# month
df_ = df.copy()
df_['date'] = pd.to_datetime(df_['date'])
df_.set_index('date', inplace=True)
df_ = df_.to_period('M').to_timestamp('M')
count_m = df_.groupby(['date', 'topic']).size().reset_index(name='count')
count_m['adj'] = count_m['count']
mask1 = (count_m['date'] < dt.date(2005, 1, 1))
mask2 = (dt.date(2005, 1, 1) < count_m['date']) & (count_m['date'] < dt.date(2009, 10, 17))
count_m.loc[mask1, 'adj'] = count_m.loc[mask1, 'count'] * 2
count_m.loc[mask2, 'adj'] = count_m.loc[mask2, 'count'] * (4/3)

# year (2018 is incomplete in the data, hence the extra 6/5 weight)
df_ = df.copy()
df_['date'] = pd.to_datetime(df_['date'])
df_.set_index('date', inplace=True)
df_ = df_.to_period('Y').to_timestamp('Y')
count_y = df_.groupby(['date', 'topic']).size().reset_index(name='count')
count_y['adj'] = count_y['count']
mask1 = (count_y['date'] < dt.date(2005, 1, 1))
mask2 = (dt.date(2005, 1, 1) < count_y['date']) & (count_y['date'] < dt.date(2009, 10, 17))
mask3 = (count_y['date'] >= dt.date(2018, 1, 1))
count_y.loc[mask1, 'adj'] = count_y.loc[mask1, 'count'] * 2
count_y.loc[mask2, 'adj'] = count_y.loc[mask2, 'count'] * (4/3)
count_y.loc[mask3, 'adj'] = count_y.loc[mask3, 'count'] * 6/5

def dp(topics, labels, period, adj=False):
    # Plot article counts for the given topics at daily/monthly/yearly frequency
    plt.style.use('seaborn-whitegrid')
    fig, ax = plt.subplots(figsize=(9.5, 6.5), dpi=100)
    for topic, label in zip(topics, labels):
        if period == 'D':
            xy = count_d[(count_d.topic == topic) & (count_d.date >= dt.date(2000, 1, 1))]
        elif period == 'M':
            xy = count_m[(count_m.topic == topic) & (count_m.date >= dt.date(2000, 1, 1))]
        elif period == 'Y':
            xy = count_y[(count_y.topic == topic) & (count_y.date >= dt.date(2000, 1, 1))]
        else:
            return None
        x = xy['date']
        y = xy['adj'] if adj else xy['count']
        ax.plot(x, y, alpha=0.7, linewidth=1.3, label='topic # : %s(%s)' % (label, str(topic)))
    ax.set_xlim(dt.date(2000, 1, 1), dt.date(2018, 12, 31))
    ax.set_xlabel('year', fontsize=20)
    ax.set_ylabel('Number of Articles', fontsize=24)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.grid(color='grey', linestyle='-', linewidth=0, alpha=1)
    ax.tick_params(axis='both', labelsize=17)
    ax.legend()
    plt.xticks(np.arange(dt.date(2000, 1, 1), dt.date(2018, 12, 31), dt.timedelta(731)))
    plt.show()

# Draw graphs (day, month, year frequency for provocation and dialogue topics)
dp([12, 16], ['provocation', 'nuclear'], 'D', adj=True)
dp([13, 14], ['South-North', 'Global'], 'D', adj=True)
dp([12, 16], ['provocation', 'nuclear'], 'M', adj=True)
dp([13, 14], ['South-North', 'Global'], 'M', adj=True)
dp([12, 16], ['provocation', 'nuclear'], 'Y', adj=True)
dp([13, 14], ['South-North', 'US-North'], 'Y', adj=True)

# Number of total articles
plt.style.use('seaborn-whitegrid')
fig, ax = plt.subplots(figsize=(9.5, 6.5), dpi=100)
xy = df.groupby(['date']).size().reset_index(name='count')
xy = xy[xy['date'] > dt.date(1995, 1, 1)]
x = xy['date']
y = xy['count']
ax.plot(x, y, alpha=0.8, linewidth=1.3, label='total')
ax.set_xlim(min(x), max(x))
ax.set_xlabel('year', fontsize=20)
ax.set_ylabel('Number of Articles', fontsize=24)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.grid(color='grey', linestyle='-', linewidth=0, alpha=1)
ax.legend()
plt.xticks(np.arange(min(x), max(x), dt.timedelta(730)))
plt.show()
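One caveat: the script uses id2word and lda_model without ever constructing them, so the dictionary-building and training step did not survive in this post. A minimal sketch of what it presumably looked like, given the 20-topic setting stated above (passes and random_state are assumed values, not the original hyperparameters); it belongs before the first call to get_topic:

# Build the dictionary and bag-of-words corpus from the tokenized articles
texts = [item[2] for item in data_words]
id2word = corpora.Dictionary(texts)
corpus = [id2word.doc2bow(text) for text in texts]

# Train the 20-topic LDA model (passes/random_state are assumed values)
lda_model = gensim.models.LdaModel(corpus=corpus, id2word=id2word,
                                   num_topics=20, passes=10, random_state=100)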
I recall that installing Caffe on Windows was one of the hardest steps in this project.
/******************************************************************
— Title : [Python; NetworkX] Supply Chain analysis
— Keywords : networkx, Node, Edge, Centrality, Supply Chain, Value Chain
*******************************************************************/
Data
Graph
Sample Code :
(Reference : https://briandew.wordpress.com/2016/06/15/trade-network-analysis-why-centrality-matters/)
import networkx as nx
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

def draw_G(G, year):
    # Node color encodes out-degree centrality; node size encodes in-degree centrality
    oc = nx.out_degree_centrality(G)
    for key in oc.keys():
        oc[key] = oc[key] * 10
    nx.set_node_attributes(G, name='cent', values=oc)
    ic = nx.in_degree_centrality(G)
    nx.set_node_attributes(G, name='in', values=ic)
    node_size = [float(G.nodes[v]['in']) * 20000 + 1 for v in G]  # G.node was removed in networkx 2.4
    node_color = [float(G.nodes[v]['cent']) for v in G]
    pos = nx.spring_layout(G, k=30, iterations=8)
    nodes = nx.draw_networkx_nodes(G, pos, node_size=node_size,
                                   node_color=node_color, alpha=0.5)
    edges = nx.draw_networkx_edges(G, pos, edge_color='black', arrows=True, width=0.3)
    nx.draw_networkx_labels(G, pos, font_size=5)
    plt.text(0, -1.2, 'Node color is out_degree_centrality', fontsize=7)
    plt.title('Compustat firms Supply Chain (year : ' + str(year) + ')', fontsize=12)
    nodes.set_clim(0, 1)  # set on the mappable; Colorbar.set_clim was removed in matplotlib 3.3
    cbar = plt.colorbar(mappable=nodes, fraction=0.015, pad=0.04)
    plt.margins(0, 0)
    plt.axis('off')
    plt.savefig(str(year) + 'Supply Chain.png', dpi=1000)
    plt.show()
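draw_G expects the directed supplier-customer graph G to be built beforehand from the Compustat links; that step is not shown above. A minimal sketch, with hypothetical column names (supplier, customer) and a toy edge list standing in for the real data:

# Toy edge list: one row per supplier -> customer relationship
links = pd.DataFrame({'supplier': ['A', 'A', 'B'],
                      'customer': ['B', 'C', 'C']})
G = nx.from_pandas_edgelist(links, source='supplier', target='customer',
                            create_using=nx.DiGraph())
draw_G(G, 2016)  # the year only labels the title and the output file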
Numbers (Statistics)
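The statistics themselves are not in this post; below is a minimal sketch of how the per-node centrality numbers behind the graph can be tabulated, assuming G from the sample code above. The chosen measures (in- and out-degree centrality) are an assumption, not necessarily the ones originally reported:

# Collect in- and out-degree centrality into one table, most central firms first
cent = pd.DataFrame({'out_centrality': nx.out_degree_centrality(G),
                     'in_centrality': nx.in_degree_centrality(G)})
print(cent.sort_values('out_centrality', ascending=False).head(10))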