2.2. Topic modelling med gensim og dhlab#
import dhlab as dh
import pandas as pd
import gensim
import pyLDAvis
from pprint import pprint
import pyLDAvis.gensim_models as genvis
# sprit urn URN:NBN:no-nb_digibok_2020090207537
Finn en bok
# Fetch a single book (doctype "digibok") with "Norgeshistorie" in the title
# from the National Library (dhlab) collection.
bok = dh.Corpus(doctype="digibok", limit=1, title="Norgeshistorie")
# Notebook cell output: show the corpus metadata frame.
bok
dhlabid | urn | title | authors | oaiid | sesamid | isbn10 | city | timestamp | year | publisher | langs | subjects | ddc | genres | literaryform | doctype | ocr_creator | ocr_timestamp | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 100443356 | URN:NBN:no-nb_digibok_2008071500119 | Norgeshistorie for realskolen | Jensen , Magnus | oai:nb.bibsys.no:999825090494702202 | cc8bff9c30101cd12ecf5d436e91d30d | Oslo | 19530101 | 1953 | nob | Uklassifisert | digibok | nb | 20060101 |
# Pull the URN identifier of the single hit in the corpus frame.
urn = list(bok.frame.urn)[0]
# Show the URN (notebook cell output).
urn
'URN:NBN:no-nb_digibok_2008071500119'
2.2.1. Chunking#
# Chunks
# Chunks: split the book identified by `urn` into chunks of ~1000 tokens;
# each chunk is a {token: count} frequency dict.
res = dh.Chunks(chunks=1000, urn=urn)
# Number of chunks produced (notebook cell output).
len(res.chunks)
62
def chunks_to_corpus(chunks_list):
    """Expand frequency-dict chunks into space-separated bag-of-words strings.

    Each chunk is a dict mapping token -> count; every token is repeated
    `count` times in the output string (each occurrence followed by a space,
    so each non-empty result carries a trailing space, as before).

    Args:
        chunks_list: iterable of {token: int} frequency dicts.

    Returns:
        list[str]: one bag-of-words string per chunk.
    """
    corpus = []
    for chunk in chunks_list:
        # "".join of pre-built parts avoids the quadratic cost of
        # repeated `+=` string concatenation in the original loop.
        parts = [(token + " ") * count for token, count in chunk.items()]
        corpus.append("".join(parts))
    return corpus
texts = chunks_to_corpus(res.chunks)
2.2.2. Find delta TFIDF#
# Term-document matrix: one row per token, one column per chunk;
# tokens absent from a chunk get count 0.
df = pd.DataFrame(res.chunks).T.fillna(0)
# Show the matrix (notebook cell output).
df
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
! | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 7.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
" | 5.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 | ... | 3.0 | 0.0 | 0.0 | 3.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
% | 2.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 4.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
& | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 7.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
' | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 7.0 | ... | 0.0 | 0.0 | 0.0 | 5.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
utkastet | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
utsettingsforslag | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
velte | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
venstreregjeringen | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
vidtgående | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
9259 rows × 62 columns
# Import top 50000 tokens from NB collection
tot = dh.totals(50000)
# Relative frequency: this book's token counts divided by the reference
# counts from the whole collection.
# NOTE: use a fresh name rather than reusing `res`, which still holds the
# dh.Chunks object created above — clobbering it made the script fragile.
rel_freq = df.sum(axis=1) / tot.freq
# Keep the 1000 tokens most over-represented in this book relative to the
# collection (tokens missing from the reference list are dropped).
target_tokens = rel_freq.sort_values(ascending=False).iloc[:1000].dropna().index
df.loc[target_tokens]
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
årh. | 0.0 | 0.0 | 0.0 | 1.0 | 2.0 | 0.0 | 1.0 | 0.0 | 0.0 | 2.0 | ... | 1.0 | 0.0 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
riker | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 2.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 2.0 | 0.0 | 0.0 |
adel | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
Valdemar | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
valte | 1.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
vernepliktige | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
blodig | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
dyrkede | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
874 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
hvo | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
1000 rows × 62 columns
# Rebuild each chunk as a bag-of-words string restricted to the target tokens.
# Hoist the label-indexing out of the loop: the original recomputed
# `df.loc[target_tokens]` on every column, repeating the expensive lookup.
sub = df.loc[target_tokens]
outer_lst = []
for col in sub.columns:
    # Collect repeated tokens, then join once — avoids quadratic `+=` concat.
    parts = []
    for token, count in sub[col].items():
        if count > 0:
            parts.append((token + " ") * int(count))
    outer_lst.append("".join(parts))
# One string per chunk (notebook cell output).
len(outer_lst)
62
2.2.3. Prep for LDA#
# Tokenise each chunk string, then build the gensim vocabulary and the
# bag-of-words corpus expected by LdaMulticore.
data = [text.split() for text in outer_lst]
id2word = gensim.corpora.Dictionary(data)
corpus = [id2word.doc2bow(tokens) for tokens in data]
2.2.4. Make model#
# Train a 10-topic LDA model on the bag-of-words corpus.
# NOTE(review): no random_state is passed, so the topics differ between runs.
lda_model = gensim.models.LdaMulticore(corpus=corpus, id2word=id2word, num_topics=10)
# Print the top terms of each topic (notebook cell output).
pprint(lda_model.print_topics())
[(0,
'0.017*"bøndene" + 0.014*"kongen" + 0.012*"gav" + 0.012*"skip" + '
'0.011*"Danmark" + 0.010*"satte" + 0.009*"konge" + 0.009*"Magnus" + '
'0.008*"danske" + 0.008*"årh."'),
(1,
'0.023*"Stortinget" + 0.020*"bøndene" + 0.020*"Karl" + 0.012*"Sverdrup" + '
'0.012*"gav" + 0.011*"Sverige" + 0.011*"Russland" + 0.008*"satte" + '
'0.008*"Gustav" + 0.008*"Finnland"'),
(2,
'0.023*"Danmark" + 0.019*"danske" + 0.017*"kongen" + 0.015*"Fredrik" + '
'0.014*"gav" + 0.011*"konge" + 0.011*"årh." + 0.010*"Kristian" + '
'0.010*"Sverige" + 0.009*"Stortinget"'),
(3,
'0.018*"Danmark" + 0.017*"Sverige" + 0.017*"Kristian" + 0.015*"gav" + '
'0.013*"Fredrik" + 0.012*"konge" + 0.012*"Stortinget" + 0.011*"satte" + '
'0.011*"Kongen" + 0.011*"kongen"'),
(4,
'0.029*"Sverige" + 0.026*"gav" + 0.022*"Gustav" + 0.011*"bøndene" + '
'0.011*"igjennom" + 0.010*"Danmark" + 0.010*"Russland" + 0.010*"Karl" + '
'0.010*"Adolf" + 0.009*"rike"'),
(5,
'0.028*"Sverige" + 0.022*"Kristian" + 0.019*"konge" + 0.019*"danske" + '
'0.018*"Danmark" + 0.016*"Karl" + 0.016*"bøndene" + 0.016*"kongen" + '
'0.015*"gav" + 0.014*"svenske"'),
(6,
'0.038*"Sverige" + 0.022*"Danmark" + 0.021*"Kristian" + 0.019*"svenske" + '
'0.018*"Karl" + 0.013*"bøndene" + 0.012*"Stortinget" + 0.010*"gav" + '
'0.009*"tyskerne" + 0.009*"Magnus"'),
(7,
'0.022*"bøndene" + 0.021*"Kristian" + 0.015*"Danmark" + 0.014*"satte" + '
'0.012*"gav" + 0.010*"Sverige" + 0.010*"allierte" + 0.010*"Karl" + '
'0.009*"kongen" + 0.009*"Fredrik"'),
(8,
'0.011*"Sverige" + 0.010*"bøndene" + 0.010*"Stortinget" + 0.009*"NS" + '
'0.009*"kongen" + 0.009*"organisasjonene" + 0.009*"svenske" + '
'0.009*"unionen" + 0.008*"Quisling" + 0.008*"prester"'),
(9,
'0.032*"Danmark" + 0.019*"danske" + 0.018*"Håkon" + 0.017*"Sverige" + '
'0.016*"kongen" + 0.014*"gav" + 0.013*"regjering" + 0.012*"Stortinget" + '
'0.012*"bøndene" + 0.010*"Fredrik"')]
# Build the pyLDAvis data structure (topic-term distances, term bar charts)
# from the trained model, the BoW corpus and the vocabulary.
prep = genvis.prepare(lda_model, corpus, id2word)
# Save to html (uncomment to export a standalone file instead)
# pyLDAvis.save_html(prep, "result.html")
# Enable inline rendering, then display the interactive visualisation.
pyLDAvis.enable_notebook()
pyLDAvis.display(prep)