2.2. Topic modelling with gensim and dhlab#

import dhlab as dh
import pandas as pd
import gensim
import pyLDAvis
from pprint import pprint
import pyLDAvis.gensim_models as genvis
# urn for another book ("sprit"), not used below: URN:NBN:no-nb_digibok_2020090207537

Find a book

# Find a single book ("digibok") with "Norgeshistorie" in the title
bok = dh.Corpus(doctype="digibok", limit=1, title="Norgeshistorie")
bok
dhlabid: 100443356 · urn: URN:NBN:no-nb_digibok_2008071500119 · title: Norgeshistorie for realskolen ·
authors: Jensen, Magnus · oaiid: oai:nb.bibsys.no:999825090494702202 · sesamid: cc8bff9c30101cd12ecf5d436e91d30d ·
city: Oslo · timestamp: 19530101 · year: 1953 · langs: nob · literaryform: Uklassifisert · doctype: digibok ·
ocr_creator: nb · ocr_timestamp: 20060101 (one-row DataFrame; empty columns omitted)
urn = bok.frame.urn.tolist()[0]
urn
'URN:NBN:no-nb_digibok_2008071500119'

2.2.1. Chunking#

# Split the book into chunks of 1000 tokens; each chunk is a bag of words (token -> count)
res = dh.Chunks(chunks=1000, urn=urn)
len(res.chunks)
62
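Each element of res.chunks is a bag of words for one chunk, i.e. a mapping from token to frequency; that is the structure chunks_to_corpus below relies on. A quick, optional way to inspect the first chunk:

# Peek at the first chunk (inspection only, not part of the pipeline)
list(res.chunks[0].items())[:10]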
def chunks_to_corpus(chunks_list):
    """Turn a list of {token: count} chunks into a list of text strings,
    repeating each token as many times as it occurs in its chunk."""
    res = []
    for chunk in chunks_list:
        inner_res = ""
        for token, count in chunk.items():
            inner_res += (token + " ") * count
        res.append(inner_res)
    return res
texts = chunks_to_corpus(res.chunks)
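To sanity-check the reconstruction you can preview the start of the first chunk text; note that word order is not preserved, each token is simply repeated according to its count.

# Preview the first reconstructed chunk (illustrative check only)
texts[0][:200]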

2.2.2. Find delta TF-IDF#

# Token-by-chunk frequency matrix: one row per token, one column per chunk
df = pd.DataFrame(res.chunks).transpose().fillna(0)
df
0 1 2 3 4 5 6 7 8 9 ... 52 53 54 55 56 57 58 59 60 61
! 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 7.0 0.0 0.0 0.0 0.0 0.0 0.0
" 5.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 ... 3.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0
% 2.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 ... 0.0 0.0 0.0 4.0 0.0 0.0 0.0 0.0 0.0 0.0
& 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 7.0 0.0 0.0 0.0 0.0 0.0 0.0
' 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 7.0 ... 0.0 0.0 0.0 5.0 1.0 0.0 0.0 0.0 0.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
utkastet 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
utsettingsforslag 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
velte 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
venstreregjeringen 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
vidtgående 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0

9259 rows × 62 columns

# Reference frequencies: the 50,000 most frequent tokens in the NB collection
tot = dh.totals(50000)
# Relative frequency: the book's token counts divided by the reference counts
rel_freq = df.sum(axis=1) / tot.freq
# Keep the 1000 tokens that are most over-represented relative to the reference
target_tokens = rel_freq.sort_values(ascending=False).iloc[:1000].dropna().index
df.loc[target_tokens]
0 1 2 3 4 5 6 7 8 9 ... 52 53 54 55 56 57 58 59 60 61
årh. 0.0 0.0 0.0 1.0 2.0 0.0 1.0 0.0 0.0 2.0 ... 1.0 0.0 2.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0
riker 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 2.0 ... 0.0 0.0 0.0 0.0 1.0 1.0 0.0 2.0 0.0 0.0
adel 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 ... 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0
Valdemar 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
valte 1.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
vernepliktige 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
blodig 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0
dyrkede 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
874 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
hvo 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

1000 rows × 62 columns
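If you want to see which tokens drive the selection, the rel_freq series from the cell above can be sorted directly; this inspection step is an optional addition, not part of the original pipeline.

# The ten tokens most over-represented in the book relative to the reference counts
rel_freq.sort_values(ascending=False).head(10)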

# Rebuild one text string per chunk, keeping only the selected tokens
sub = df.loc[target_tokens]
outer_lst = []

for col in sub.columns:
    inner_str = ""
    for token, count in sub[col].items():
        if count > 0:
            inner_str += (token + " ") * int(count)
    outer_lst.append(inner_str)
len(outer_lst)
62

2.2.3. Prep for LDA#

# Tokenize each chunk text, then build the gensim dictionary and the bag-of-words corpus
data = [x.split() for x in outer_lst]
id2word = gensim.corpora.Dictionary(data)
corpus = [id2word.doc2bow(chunk) for chunk in data]
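Each entry in corpus is now a list of (token_id, count) pairs, and id2word maps the ids back to token strings. A small, optional check of the first chunk:

# Resolve the first few (token_id, count) pairs of the first chunk back to tokens
[(id2word[token_id], count) for token_id, count in corpus[0][:10]]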

2.2.4. Make model#

# Train an LDA model with 10 topics on the chunked book
lda_model = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=10
)
pprint(lda_model.print_topics())
[(0,
  '0.017*"bøndene" + 0.014*"kongen" + 0.012*"gav" + 0.012*"skip" + '
  '0.011*"Danmark" + 0.010*"satte" + 0.009*"konge" + 0.009*"Magnus" + '
  '0.008*"danske" + 0.008*"årh."'),
 (1,
  '0.023*"Stortinget" + 0.020*"bøndene" + 0.020*"Karl" + 0.012*"Sverdrup" + '
  '0.012*"gav" + 0.011*"Sverige" + 0.011*"Russland" + 0.008*"satte" + '
  '0.008*"Gustav" + 0.008*"Finnland"'),
 (2,
  '0.023*"Danmark" + 0.019*"danske" + 0.017*"kongen" + 0.015*"Fredrik" + '
  '0.014*"gav" + 0.011*"konge" + 0.011*"årh." + 0.010*"Kristian" + '
  '0.010*"Sverige" + 0.009*"Stortinget"'),
 (3,
  '0.018*"Danmark" + 0.017*"Sverige" + 0.017*"Kristian" + 0.015*"gav" + '
  '0.013*"Fredrik" + 0.012*"konge" + 0.012*"Stortinget" + 0.011*"satte" + '
  '0.011*"Kongen" + 0.011*"kongen"'),
 (4,
  '0.029*"Sverige" + 0.026*"gav" + 0.022*"Gustav" + 0.011*"bøndene" + '
  '0.011*"igjennom" + 0.010*"Danmark" + 0.010*"Russland" + 0.010*"Karl" + '
  '0.010*"Adolf" + 0.009*"rike"'),
 (5,
  '0.028*"Sverige" + 0.022*"Kristian" + 0.019*"konge" + 0.019*"danske" + '
  '0.018*"Danmark" + 0.016*"Karl" + 0.016*"bøndene" + 0.016*"kongen" + '
  '0.015*"gav" + 0.014*"svenske"'),
 (6,
  '0.038*"Sverige" + 0.022*"Danmark" + 0.021*"Kristian" + 0.019*"svenske" + '
  '0.018*"Karl" + 0.013*"bøndene" + 0.012*"Stortinget" + 0.010*"gav" + '
  '0.009*"tyskerne" + 0.009*"Magnus"'),
 (7,
  '0.022*"bøndene" + 0.021*"Kristian" + 0.015*"Danmark" + 0.014*"satte" + '
  '0.012*"gav" + 0.010*"Sverige" + 0.010*"allierte" + 0.010*"Karl" + '
  '0.009*"kongen" + 0.009*"Fredrik"'),
 (8,
  '0.011*"Sverige" + 0.010*"bøndene" + 0.010*"Stortinget" + 0.009*"NS" + '
  '0.009*"kongen" + 0.009*"organisasjonene" + 0.009*"svenske" + '
  '0.009*"unionen" + 0.008*"Quisling" + 0.008*"prester"'),
 (9,
  '0.032*"Danmark" + 0.019*"danske" + 0.018*"Håkon" + 0.017*"Sverige" + '
  '0.016*"kongen" + 0.014*"gav" + 0.013*"regjering" + 0.012*"Stortinget" + '
  '0.012*"bøndene" + 0.010*"Fredrik"')]
# Prepare the interactive pyLDAvis view of the topic model
prep = genvis.prepare(lda_model, corpus, id2word)
# Save to html
# pyLDAvis.save_html(prep, "result.html")
pyLDAvis.enable_notebook()
pyLDAvis.display(prep)
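If you want to reuse the model later without retraining, it can be persisted with gensim's save/load; the file name below is just an example.

# Persist the trained model and reload it later (file name is arbitrary)
# lda_model.save("lda_norgeshistorie.model")
# lda_model = gensim.models.LdaMulticore.load("lda_norgeshistorie.model")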