Created by Ivan Lima on Mon Jun 7 2021 12:16:15 -0400
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import wordcloud, datetime
print('Last updated on {}'.format(datetime.datetime.now().ctime()))
Last updated on Thu Jun 10 16:32:11 2021
plt.rcParams['figure.dpi'] = 100
pd.options.display.max_columns = 30
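The savefig calls below write into a figures/ directory; a small guard (directory name taken from the paths used below) creates it if it does not already exist:
import os
os.makedirs('figures', exist_ok=True)  # savefig below assumes this directory exists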
df = pd.read_csv('data/survey_responses.csv')
df.columns = ['time','definition','context','capability','expertise','questions','position','department','email']
stopwords = ['data','and','I','we','they','them','of','for','to','the','in','on','at','from','it','that','this',
             'is','are','am','be','a','with','my','than','as','but','also','not','non']
for col in ['context', 'capability', 'expertise', 'questions']:
    swords = stopwords + [col,]  # also drop the question's own name from its cloud
    wc = wordcloud.WordCloud(background_color='white', width=600, height=400, colormap=plt.cm.tab10,
                             stopwords=swords, random_state=42).generate(' '.join(df[col].dropna()))
    fig, ax = plt.subplots(figsize=(9,6))
    _ = ax.imshow(wc)
    _ = ax.axis('off')
    _ = ax.set_title(col.title())
    fname = 'figures/wordcloud_{}.png'.format(col)
    fig.savefig(fname, dpi=600, bbox_inches='tight')
Combine responses to all questions into one dataset.
df_combined = pd.concat([df.context.dropna(), df.capability.dropna(), df.expertise.dropna(), df.questions.dropna()], ignore_index=True)
combined = df_combined.tolist()
print('Total of {} responses'.format(len(combined)))
Total of 103 responses
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
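word_tokenize, pos_tag and the lemmatizer below each depend on an NLTK data package that is not bundled with the library; a one-time download, assuming a standard NLTK 3.x install:
nltk.download('punkt')                       # tokenizer models
nltk.download('averaged_perceptron_tagger')  # POS tagger
nltk.download('wordnet')                     # lemmatizer data
nltk.download('omw-1.4')                     # required by wordnet on newer NLTK releases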
def get_pos_tag(s):
    """Map a Penn Treebank POS tag to the first character lemmatize() accepts."""
    tag = s[0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

def custom_tokenizer(doc):
    """Tokenize a document and lemmatize each token according to its POS tag."""
    return [lemmatizer.lemmatize(w, get_pos_tag(s)) for w, s in nltk.pos_tag(nltk.word_tokenize(doc))]
responses = [' '.join(custom_tokenizer(s)) for s in combined]
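As a spot check of the POS-aware lemmatization (the exact lemmas depend on the tagger model):
print(custom_tokenizer('models are running analyses'))  # typically ['model', 'be', 'run', 'analysis']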
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
stopwords = ENGLISH_STOP_WORDS.union({'data', 'expertise', 'question', 'whoi'})
vectorizer_tfidf = TfidfVectorizer(stop_words=list(stopwords)).fit(responses)  # stop_words expects a list of strings
X_tfidf = vectorizer_tfidf.transform(responses)
print('X_tfidf shape:{}'.format(X_tfidf.shape))
X_tfidf shape:(103, 620)
tot_freq = X_tfidf.toarray().sum(axis=0)
sort_by_freq = np.argsort(tot_freq)[::-1]
feature_names = np.array(vectorizer_tfidf.get_feature_names())  # get_feature_names_out() in scikit-learn >= 1.0
print('Words with high tf-idf (decreasing order):\n{}'.format(feature_names[sort_by_freq[:100]]))
Words with high tf-idf (decreasing order):
['tool' 'learning' 'machine' 'different' 'need' 'big' 'ocean' 'analysis'
 'make' 'volume' 'learn' 'large' 'computational' 'help' 'high' 'new'
 'deep' 'method' 'idea' 'use' 'type' 'integration' 'project' 'research'
 'good' 'file' 'time' 'chemical' 'expert' 'practice' 'instrument' 'process'
 'model' 'gpus' 'software' 'best' 'format' 'develop' 'cloud' 'house'
 'management' 'training' 'satellite' 'create' 'funding' 'tools' 'ai'
 'resource' 'biological' 'gene' 'image' 'education' 'poseidon' 'approach'
 'work' 'available' 'preliminary' 'ability' 'way' 'pipeline' 'datatypes'
 'applicable' 'apply' 'qc' 'development' 'access' 'technical' 'efficiency'
 'set' 'grant' 'multiple' 'faculty' 'cross' 'recognition' 'ml' 'automate'
 'control' 'share' 'update' 'address' 'people' 'like' 'standardization'
 'current' 'climate' 'simulation' 'online' 'decade' 'community' 'scientist'
 'quality' 'specie' 'scale' 'skill' 'situ' 'focus' 'measuremetns'
 'predictability' 'earth' 'turbulence']
from sklearn.decomposition import NMF
from sklearn.preprocessing import normalize
n_topics = 10 # number of topics to extract
nmf = NMF(n_components=n_topics,random_state=42,init='nndsvd',max_iter=500)
X_topics = nmf.fit_transform(X_tfidf)
X_topics = normalize(X_topics, norm='l1')  # normalize each row to sum to 1 (topic proportions per response)
topics = np.argmax(X_topics, axis=1) + 1   # dominant topic (1-based) for each response
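Since each row of X_topics now sums to one, the topic mix of an individual response can be read off directly, e.g. for the first response:
print('Topic mix: {}'.format(np.round(X_topics[0], 2)))
print('Dominant topic: {}'.format(topics[0]))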
Topic number, the 10 most frequent words in each topic, and topic frequency
sort_by_feature = np.argsort(nmf.components_, axis=1)[:,::-1]
topic_names_r = np.array(['{:2d} {}'.format(n+1, ' '.join(s))
                          for n, s in enumerate(feature_names[sort_by_feature[:,:10]])])
topic_freq = X_topics.sum(axis=0)/X_topics.sum()
for tn, tf in zip(topic_names_r, topic_freq):
    print('{:107s} {:4.1f}%'.format(tn, tf*100))
 1 different applicable datatypes help type glue expert generalize tool instrument      6.5%
 2 practice best make management software resource version tools people control        14.1%
 3 learn big ai machine analysis specialist analyst apply certain appropriate           9.6%
 4 integration chemical biological good solution multi omics space understand time      5.9%
 5 deep learning gpus investment poseidon sensor network mixing dedicated address       4.4%
 6 computational preliminary faculty method funding hiring grant principle generation proof   7.7%
 7 volume high image file isterabytes genetic occasionally frequent remote velocity     9.0%
 8 new research project idea focus available way balance service need                  16.0%
 9 tool cloud set manipulation large machine learning storage develop acoustic         12.3%
10 ocean satellite process decade observe extract key century reconstruct temperature  14.4%
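The same topic frequencies can also be plotted; a minimal bar-chart sketch using topic_freq from above:
fig, ax = plt.subplots(figsize=(7,3))
ax.bar(range(1, n_topics + 1), topic_freq * 100)
ax.set_xlabel('topic')
ax.set_ylabel('% of responses')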
20 most frequent words in each topic
n_top_words = 20 # number of words to show per topic
sort_by_feature = np.argsort(nmf.components_, axis=1)[:,::-1]
cols = ['topic {}'.format(n+1) for n in range(nmf.components_.shape[0])]
df_topics = pd.DataFrame(feature_names[sort_by_feature][:,:n_top_words].transpose(),columns=cols)
df_topics
|    | topic 1 | topic 2 | topic 3 | topic 4 | topic 5 | topic 6 | topic 7 | topic 8 | topic 9 | topic 10 |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | different | practice | learn | integration | deep | computational | volume | new | tool | ocean |
| 1 | applicable | best | big | chemical | learning | preliminary | high | research | cloud | satellite |
| 2 | datatypes | make | ai | biological | gpus | faculty | image | project | set | process |
| 3 | help | management | machine | good | investment | method | file | idea | manipulation | decade |
| 4 | type | software | analysis | solution | poseidon | funding | isterabytes | focus | large | observe |
| 5 | glue | resource | specialist | multi | sensor | hiring | genetic | available | machine | extract |
| 6 | expert | version | analyst | omics | network | grant | occasionally | way | learning | key |
| 7 | generalize | tools | apply | space | mixing | principle | frequent | balance | storage | century |
| 8 | tool | people | certain | understand | dedicated | generation | remote | service | develop | reconstruct |
| 9 | instrument | control | appropriate | time | address | proof | velocity | need | acoustic | temperature |
| 10 | machine | statistic | approach | process | machine | internal | sense | current | model | large |
| 11 | learning | citable | like | impact | innovative | distribute | veracity | gene | format | met |
| 12 | need | use | unsupervised | microorganism | advanced | developed | work | answer | analyze | situ |
| 13 | format | interpretation | seismic | linking | idea | sharable | big | complement | curate | lot |
| 14 | heterogeneous | automate | tool | discipline | ocean | supporting | recognition | discoverable | shared | visualization |
| 15 | include | guidance | use | examine | generalize | technical | collaborator | generate | important | download |
| 16 | radiation | opportunity | design | ocean | help | hire | standardization | address | matching | sst |
| 17 | salinity | promotion | forecast | instrument | transport | support | collect | fund | input | atmospheric |
| 18 | ship | code | interesting | need | develop | analysis | variety | datasets | climate | sss |
| 19 | common | knowledge | classification | ooi | glacial | expert | quality | method | simulation | pco2 |
nmf_components_sorted = np.take_along_axis(nmf.components_, sort_by_feature, axis=1)
feature_names_sorted = feature_names[sort_by_feature]
word_freq = np.round(nmf_components_sorted * 5).astype(int)  # scale NMF weights to integer pseudo-counts
topic_words = {}  # words per topic whose scaled weight rounds above zero
for t in range(1, n_topics + 1):
    topic_words[t] = []
    for n in range(len(feature_names)):
        if word_freq[t-1, n] > 0:
            topic_words[t].append(feature_names_sorted[t-1, n])
fig, axs = plt.subplots(2, 5, figsize=(14,6))
fig.subplots_adjust(wspace=0.05, hspace=0.05)
for ax, t in zip(axs.ravel(), range(1, n_topics + 1)):
    wc = wordcloud.WordCloud(background_color='white', width=500, height=500, colormap=plt.cm.tab10,
                             random_state=42, relative_scaling=0.5).generate(' '.join(topic_words[t]))
    _ = ax.imshow(wc)
    _ = ax.axis('off')
    _ = ax.set_title('Topic {}'.format(t), fontsize=9)
fig.savefig('figures/wordcloud_topics.png', dpi=600, bbox_inches='tight')