Created by Ivan Lima on Mon Jun 7 2021 12:16:15 -0400
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import wordcloud, datetime
print('Last updated on {}'.format(datetime.datetime.now().ctime()))
Last updated on Thu Jun 10 16:32:11 2021
plt.rcParams['figure.dpi'] = 100
pd.options.display.max_columns = 30
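The savefig calls below write into a figures/ directory; a small guard (directory name taken from the paths used below) creates it if it does not already exist:
import os
os.makedirs('figures', exist_ok=True)  # savefig below assumes this directory exists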
df = pd.read_csv('data/survey_responses.csv')
df.columns = ['time','definition','context','capability','expertise','questions','position','department','email']
stopwords = ['data','and','I','we','they','them','of','for','to','the','in','on','at','from','it','that','this',
             'is','are','am','be','a','with','my','than','as','but','also','not','non']
for col in ['context', 'capability', 'expertise', 'questions']:
    swords = stopwords + [col,]  # also drop the question's own name from its cloud
    wc = wordcloud.WordCloud(background_color='white', width=600, height=400, colormap=plt.cm.tab10,
                             stopwords=swords, random_state=42).generate(' '.join(df[col].dropna()))
    fig, ax = plt.subplots(figsize=(9,6))
    _ = ax.imshow(wc)
    _ = ax.axis('off')
    _ = ax.set_title(col.title())
    fname = 'figures/wordcloud_{}.png'.format(col)
    fig.savefig(fname, dpi=600, bbox_inches='tight')
Combine responses to all questions into one dataset.
df_combined = pd.concat([df.context.dropna(), df.capability.dropna(), df.expertise.dropna(), df.questions.dropna()], ignore_index=True)
combined = df_combined.tolist()
print('Total of {} responses'.format(len(combined)))
Total of 103 responses
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
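word_tokenize, pos_tag and the lemmatizer below each depend on an NLTK data package that is not bundled with the library; a one-time download, assuming a standard NLTK 3.x install:
nltk.download('punkt')                       # tokenizer models
nltk.download('averaged_perceptron_tagger')  # POS tagger
nltk.download('wordnet')                     # lemmatizer data
nltk.download('omw-1.4')                     # required by wordnet on newer NLTK releases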
def get_pos_tag(s):
    """Map a Penn Treebank POS tag to the first character lemmatize() accepts."""
    tag = s[0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

def custom_tokenizer(doc):
    """Tokenize a document and lemmatize each token according to its POS tag."""
    return [lemmatizer.lemmatize(w, get_pos_tag(s)) for w, s in nltk.pos_tag(nltk.word_tokenize(doc))]
responses = [' '.join(custom_tokenizer(s)) for s in combined]
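As a spot check of the POS-aware lemmatization (the exact lemmas depend on the tagger model):
print(custom_tokenizer('models are running analyses'))  # typically ['model', 'be', 'run', 'analysis']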
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
stopwords = ENGLISH_STOP_WORDS.union({'data', 'expertise', 'question', 'whoi'})
vectorizer_tfidf = TfidfVectorizer(stop_words=list(stopwords)).fit(responses)  # stop_words expects a list of strings
X_tfidf = vectorizer_tfidf.transform(responses)
print('X_tfidf shape:{}'.format(X_tfidf.shape))
X_tfidf shape:(103, 620)
tot_freq = X_tfidf.toarray().sum(axis=0)
sort_by_freq = np.argsort(tot_freq)[::-1]
feature_names = np.array(vectorizer_tfidf.get_feature_names())  # get_feature_names_out() in scikit-learn >= 1.0
print('Words with high tf-idf (decreasing order):\n{}'.format(feature_names[sort_by_freq[:100]]))
Words with high tf-idf (decreasing order):
['tool' 'learning' 'machine' 'different' 'need' 'big' 'ocean' 'analysis'
 'make' 'volume' 'learn' 'large' 'computational' 'help' 'high' 'new'
 'deep' 'method' 'idea' 'use' 'type' 'integration' 'project' 'research'
 'good' 'file' 'time' 'chemical' 'expert' 'practice' 'instrument' 'process'
 'model' 'gpus' 'software' 'best' 'format' 'develop' 'cloud' 'house'
 'management' 'training' 'satellite' 'create' 'funding' 'tools' 'ai'
 'resource' 'biological' 'gene' 'image' 'education' 'poseidon' 'approach'
 'work' 'available' 'preliminary' 'ability' 'way' 'pipeline' 'datatypes'
 'applicable' 'apply' 'qc' 'development' 'access' 'technical' 'efficiency'
 'set' 'grant' 'multiple' 'faculty' 'cross' 'recognition' 'ml' 'automate'
 'control' 'share' 'update' 'address' 'people' 'like' 'standardization'
 'current' 'climate' 'simulation' 'online' 'decade' 'community' 'scientist'
 'quality' 'specie' 'scale' 'skill' 'situ' 'focus' 'measuremetns'
 'predictability' 'earth' 'turbulence']
from sklearn.decomposition import NMF
from sklearn.preprocessing import normalize
n_topics = 10 # number of topics to extract
nmf = NMF(n_components=n_topics,random_state=42,init='nndsvd',max_iter=500)
X_topics = nmf.fit_transform(X_tfidf)
X_topics = normalize(X_topics, norm='l1')  # normalize each row to sum to 1 (topic proportions per response)
topics = np.argmax(X_topics, axis=1) + 1   # dominant topic (1-based) for each response
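Since each row of X_topics now sums to one, the topic mix of an individual response can be read off directly, e.g. for the first response:
print('Topic mix: {}'.format(np.round(X_topics[0], 2)))
print('Dominant topic: {}'.format(topics[0]))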
Topic number, the 10 most frequent words in each topic, and topic frequency
sort_by_feature = np.argsort(nmf.components_, axis=1)[:,::-1]
topic_names_r = np.array(['{:2d} {}'.format(n+1, ' '.join(s))
                          for n, s in enumerate(feature_names[sort_by_feature[:,:10]])])
topic_freq = X_topics.sum(axis=0)/X_topics.sum()
for tn, tf in zip(topic_names_r, topic_freq):
    print('{:107s} {:4.1f}%'.format(tn, tf*100))
 1 different applicable datatypes help type glue expert generalize tool instrument      6.5%
 2 practice best make management software resource version tools people control        14.1%
 3 learn big ai machine analysis specialist analyst apply certain appropriate           9.6%
 4 integration chemical biological good solution multi omics space understand time      5.9%
 5 deep learning gpus investment poseidon sensor network mixing dedicated address       4.4%
 6 computational preliminary faculty method funding hiring grant principle generation proof   7.7%
 7 volume high image file isterabytes genetic occasionally frequent remote velocity     9.0%
 8 new research project idea focus available way balance service need                  16.0%
 9 tool cloud set manipulation large machine learning storage develop acoustic         12.3%
10 ocean satellite process decade observe extract key century reconstruct temperature  14.4%
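The same topic frequencies can also be plotted; a minimal bar-chart sketch using topic_freq from above:
fig, ax = plt.subplots(figsize=(7,3))
ax.bar(range(1, n_topics + 1), topic_freq * 100)
ax.set_xlabel('topic')
ax.set_ylabel('% of responses')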
20 most frequent words in each topic
n_top_words = 20 # number of words to show per topic
sort_by_feature = np.argsort(nmf.components_, axis=1)[:,::-1]
cols = ['topic {}'.format(n+1) for n in range(nmf.components_.shape[0])]
df_topics = pd.DataFrame(feature_names[sort_by_feature][:,:n_top_words].transpose(),columns=cols)
df_topics
|    | topic 1 | topic 2 | topic 3 | topic 4 | topic 5 | topic 6 | topic 7 | topic 8 | topic 9 | topic 10 |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | different | practice | learn | integration | deep | computational | volume | new | tool | ocean |
| 1 | applicable | best | big | chemical | learning | preliminary | high | research | cloud | satellite |
| 2 | datatypes | make | ai | biological | gpus | faculty | image | project | set | process |
| 3 | help | management | machine | good | investment | method | file | idea | manipulation | decade |
| 4 | type | software | analysis | solution | poseidon | funding | isterabytes | focus | large | observe |
| 5 | glue | resource | specialist | multi | sensor | hiring | genetic | available | machine | extract |
| 6 | expert | version | analyst | omics | network | grant | occasionally | way | learning | key |
| 7 | generalize | tools | apply | space | mixing | principle | frequent | balance | storage | century |
| 8 | tool | people | certain | understand | dedicated | generation | remote | service | develop | reconstruct |
| 9 | instrument | control | appropriate | time | address | proof | velocity | need | acoustic | temperature |
| 10 | machine | statistic | approach | process | machine | internal | sense | current | model | large |
| 11 | learning | citable | like | impact | innovative | distribute | veracity | gene | format | met |
| 12 | need | use | unsupervised | microorganism | advanced | developed | work | answer | analyze | situ |
| 13 | format | interpretation | seismic | linking | idea | sharable | big | complement | curate | lot |
| 14 | heterogeneous | automate | tool | discipline | ocean | supporting | recognition | discoverable | shared | visualization |
| 15 | include | guidance | use | examine | generalize | technical | collaborator | generate | important | download |
| 16 | radiation | opportunity | design | ocean | help | hire | standardization | address | matching | sst |
| 17 | salinity | promotion | forecast | instrument | transport | support | collect | fund | input | atmospheric |
| 18 | ship | code | interesting | need | develop | analysis | variety | datasets | climate | sss |
| 19 | common | knowledge | classification | ooi | glacial | expert | quality | method | simulation | pco2 |
nmf_components_sorted = np.take_along_axis(nmf.components_, sort_by_feature, axis=1)
feature_names_sorted = feature_names[sort_by_feature]
word_freq = np.round(nmf_components_sorted * 5).astype(int)  # scale NMF weights to integer pseudo-counts
topic_words = {}  # words per topic whose scaled weight rounds above zero
for t in range(1, n_topics + 1):
    topic_words[t] = []
    for n in range(len(feature_names)):
        if word_freq[t-1, n] > 0:
            topic_words[t].append(feature_names_sorted[t-1, n])
fig, axs = plt.subplots(2, 5, figsize=(14,6))
fig.subplots_adjust(wspace=0.05, hspace=0.05)
for ax, t in zip(axs.ravel(), range(1, n_topics + 1)):
    wc = wordcloud.WordCloud(background_color='white', width=500, height=500, colormap=plt.cm.tab10,
                             random_state=42, relative_scaling=0.5).generate(' '.join(topic_words[t]))
    _ = ax.imshow(wc)
    _ = ax.axis('off')
    _ = ax.set_title('Topic {}'.format(t), fontsize=9)
fig.savefig('figures/wordcloud_topics.png', dpi=600, bbox_inches='tight')