2.2. Topic modelling med gensim og dhlab
import dhlab as dh
import pandas as pd
import gensim
import pyLDAvis
from pprint import pprint
import pyLDAvis.gensim_models as genvis
# sprit urn URN:NBN:no-nb_digibok_2020090207537
Finn en bok
# Look up digitised books ("digibok") in the dhlab catalogue by the title query
# "Norgeshistorie". limit=1 presumably caps the result at one hit — the output
# below shows a single row.
bok = dh.Corpus(doctype="digibok", limit=1, title="Norgeshistorie")
# Bare expression so the notebook renders the corpus metadata as cell output.
bok
| dhlabid | urn | title | authors | oaiid | sesamid | isbn10 | city | timestamp | year | publisher | langs | subjects | ddc | genres | literaryform | doctype | ocr_creator | ocr_timestamp | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 100313140 | URN:NBN:no-nb_digibok_2016101748078 | Nordmenn før oss : norgeshistorie for gymnaset | Bull , Edvard | oai:nb.bibsys.no:999822590504702202 | 25dce22c432fc70670f9d4a3fdf34355 | Oslo | 19590101 | 1959 | Johan Grundt Tanum | nob | norge / historie | 948.1 | Faglitteratur | digibok | nb | 20060101 |
# Pull the URN of the first row of the corpus metadata frame; it identifies the
# book in the dhlab calls that follow.
urn = bok.frame.urn.tolist()[0]
urn
'URN:NBN:no-nb_digibok_2016101748078'
2.2.1. Chunking
# Fetch the book split into chunks. chunks=1000 presumably sets the chunk size
# in tokens — TODO confirm against the dhlab documentation.
res = dh.Chunks(chunks=1000, urn=urn)
# Number of chunks returned for this book.
len(res.chunks)
146
def chunks_to_corpus(chunks_list):
    """Expand bag-of-words chunks into plain-text pseudo-documents.

    Args:
        chunks_list: Iterable of mappings from token (str) to the number of
            times that token occurs in the chunk.

    Returns:
        A list with one string per chunk. Every token is repeated ``count``
        times and each occurrence is followed by a single space, so a
        non-empty document ends with a trailing space. Tokens appear in the
        mapping's iteration order, not in original reading order.
    """
    return [
        # int() accepts whole-number float counts (e.g. 2.0 after a round trip
        # through pandas); multiplying a str by a float raises TypeError.
        "".join((token + " ") * int(count) for token, count in chunk.items())
        for chunk in chunks_list
    ]
# One pseudo-document string per chunk.
# NOTE(review): `texts` is not referenced again in the visible part of this
# notebook — confirm whether it is still needed.
texts = chunks_to_corpus(res.chunks)
2.2.2. Find delta TFIDF
# One row per chunk, one column per token; a token absent from a chunk is NaN.
chunk_counts = pd.DataFrame(res.chunks)
# Flip to tokens x chunks and record absent tokens as a count of zero.
df = chunk_counts.T.fillna(0)
df
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ! | 1.0 | 0.0 | 1.0 | 0.0 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 6.0 | 0.0 | 0.0 |
| ' | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 3.0 | 0.0 | 0.0 |
| ( | 3.0 | 1.0 | 1.0 | 2.0 | 2.0 | 2.0 | 2.0 | 1.0 | 1.0 | 0.0 | ... | 2.0 | 0.0 | 1.0 | 1.0 | 2.0 | 0.0 | 3.0 | 4.0 | 0.0 | 2.0 |
| ) | 11.0 | 6.0 | 6.0 | 7.0 | 4.0 | 2.0 | 7.0 | 11.0 | 10.0 | 5.0 | ... | 13.0 | 4.0 | 6.0 | 4.0 | 2.0 | 6.0 | 15.0 | 15.0 | 6.0 | 18.0 |
| * | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 2.0 | 1.0 | 0.0 | 0.0 | 0.0 | 3.0 | 0.0 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| uskåret | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| varefeller | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| vevnad | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| vevstolen | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| Ål | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
18248 rows × 146 columns
# Reference frequencies for the 50000 most frequent tokens in the NB collection.
tot = dh.totals(50000)
# Total count of each token in this book, divided by its reference frequency.
# Assuming tot.freq is indexed by token, tokens missing on either side become
# NaN through index alignment.
# NOTE: this rebinds `res`, which until now held the dh.Chunks result.
book_freq = df.sum(axis=1)
res = book_freq / tot.freq
# Rank by that ratio (highest first), keep the first 1000 entries and drop any
# NaN among them; the surviving index is the token selection.
ranked = res.sort_values(ascending=False)
target_tokens = ranked.head(1000).dropna().index
df.loc[target_tokens]
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| kongene | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
| embetsmenn | 3.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | ... | 1.0 | 0.0 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| Bøndene | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 1.0 | 2.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| Grunnloven | 7.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| reformasjonen | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| fins | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| blokade | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| paragrafer | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| illegale | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| titelen | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
1000 rows × 146 columns
# Restrict the count matrix to the selected tokens once; the selection does not
# change between columns, so there is no need to redo it inside the loop.
target_counts = df.loc[target_tokens]

# Build one pseudo-document per chunk (column): each selected token is repeated
# as many times as it occurs in that chunk, every occurrence followed by a space.
outer_lst = []
for col in target_counts.columns:
    outer_lst.append(
        "".join(
            # Counts are stored as floats in the frame, hence int().
            (token + " ") * int(count)
            for token, count in target_counts[col].items()
            if count > 0
        )
    )
len(outer_lst)
146
2.2.3. Prep for LDA
# Split every pseudo-document on whitespace into a list of tokens.
data = list(map(str.split, outer_lst))
# Build the gensim dictionary over all tokenised documents.
id2word = gensim.corpora.Dictionary(data)
# Convert each tokenised document to gensim's bag-of-words form.
corpus = list(map(id2word.doc2bow, data))
2.2.4. Make model
# Train a 10-topic LDA model on the bag-of-words corpus.
lda_model = gensim.models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=10,
    # Fixed seed: LDA training is randomised, so without it the topics printed
    # below (and the visualisation built from them) differ on every run.
    random_state=42,
)
# Show the highest-weighted tokens of each topic.
pprint(lda_model.print_topics())
[(0,
'0.024*"Sverige" + 0.023*"Danmark" + 0.022*"bøndene" + 0.021*"konge" + '
'0.020*"kongen" + 0.019*"Kristian" + 0.018*"Stortinget" + 0.015*"bønder" + '
'0.014*"sjøl" + 0.014*"svenske"'),
(1,
'0.021*"bøte" + 0.018*"bøndene" + 0.015*"skip" + 0.011*"Danmark" + '
'0.011*"Kap" + 0.011*"kongen" + 0.009*"Sverige" + 0.009*"Stortinget" + '
'0.009*"Sverdrup" + 0.009*"svenske"'),
(2,
'0.056*"Kristian" + 0.024*"Fredrik" + 0.021*"kongen" + 0.021*"Sverige" + '
'0.020*"konge" + 0.016*"sjøl" + 0.014*"regjeringen" + 0.013*"Danmark" + '
'0.011*"krigen" + 0.010*"danske"'),
(3,
'0.021*"arbeiderne" + 0.014*"krigen" + 0.013*"Eder" + 0.012*"skip" + '
'0.012*"kongen" + 0.010*"Danmark" + 0.010*"byene" + 0.009*"sjøl" + '
'0.009*"Sverige" + 0.008*"borgere"'),
(4,
'0.030*"regjeringen" + 0.022*"Sverige" + 0.017*"kongen" + 0.014*"England" + '
'0.012*"svenske" + 0.012*"krigen" + 0.012*"Sverdrup" + 0.009*"Kristian" + '
'0.009*"Stortinget" + 0.009*"riksdagen"'),
(5,
'0.043*"Danmark" + 0.026*"Sverige" + 0.022*"kongen" + 0.020*"krigen" + '
'0.016*"bøndene" + 0.014*"krig" + 0.014*"danske" + 0.014*"konge" + '
'0.014*"England" + 0.010*"makt"'),
(6,
'0.027*"Sverige" + 0.013*"arbeiderne" + 0.013*"svenske" + 0.012*"Danmark" + '
'0.010*"bøndene" + 0.010*"danske" + 0.010*"England" + 0.009*"revolusjon" + '
'0.008*"industrien" + 0.008*"sjøl"'),
(7,
'0.027*"bøndene" + 0.019*"sjøl" + 0.019*"regjeringen" + 0.012*"bønder" + '
'0.011*"rike" + 0.011*"København" + 0.011*"tyskerne" + 0.010*"skip" + '
'0.010*"Rig" + 0.010*"Danmark"'),
(8,
'0.023*"bøndene" + 0.014*"industrien" + 0.012*"bønder" + 0.011*"kongen" + '
'0.010*"1850" + 0.009*"uti" + 0.009*"bøte" + 0.009*"drapsmannen" + '
'0.009*"skip" + 0.008*"sjøl"'),
(9,
'0.029*"kongen" + 0.026*"bøndene" + 0.025*"Sverige" + 0.024*"Stortinget" + '
'0.020*"svenske" + 0.014*"Danmark" + 0.014*"makt" + 0.013*"sjøl" + '
'0.013*"regjeringen" + 0.011*"krigen"')]
# Turn on inline notebook rendering for pyLDAvis before building the view.
pyLDAvis.enable_notebook()
# Prepare the visualisation data from the trained model, corpus and dictionary.
prep = genvis.prepare(lda_model, corpus, id2word)
# Save to html
# pyLDAvis.save_html(prep, "result.html")
# Last expression in the cell: renders the interactive view as cell output.
pyLDAvis.display(prep)