2.2. Topic modelling med gensim og dhlab#
import dhlab as dh
import pandas as pd
import gensim
import pyLDAvis
from pprint import pprint
import pyLDAvis.gensim_models as genvis
# sprit urn URN:NBN:no-nb_digibok_2020090207537
Finn en bok
# Ask dhlab for a corpus of digitised books ("digibok") restricted to a single
# hit (limit=1) for the title query "Norgeshistorie".
# NOTE(review): the title argument is presumably a search term rather than an
# exact match -- confirm against the dhlab Corpus documentation.
bok = dh.Corpus(doctype="digibok", limit=1, title="Norgeshistorie")
# Bare expression: shows the corpus in the notebook output.
bok
dhlabid | urn | title | authors | oaiid | sesamid | isbn10 | city | timestamp | year | publisher | langs | subjects | ddc | genres | literaryform | doctype | ocr_creator | ocr_timestamp | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 100492480 | URN:NBN:no-nb_digibok_2016021108166 | En kortfattet norgeshistorie i versform | Søraas , O. | oai:nb.bibsys.no:990202045744702202 | 93a057269a403a3c94012884ac324ac2 | Trondhjem | 19080101 | 1908 | nob | Uklassifisert | digibok | nb | 20060101 |
# Pick the URN of the first row of the corpus frame. With limit=1 above this is
# the only row; the [0] index raises IndexError if the corpus came back empty.
urn = bok.frame.urn.tolist()[0]
# Bare expression: shows the selected URN in the notebook output.
urn
'URN:NBN:no-nb_digibok_2016021108166'
2.2.1. Chunking#
# Fetch the book identified by `urn` as a sequence of chunks. Each element of
# res.chunks is consumed further down as a token -> count mapping.
# NOTE(review): chunks=1000 is presumably the chunk size in tokens -- confirm
# against the dhlab Chunks documentation.
res = dh.Chunks(chunks=1000, urn=urn)
# Number of chunks returned for this book.
len(res.chunks)
5
def chunks_to_corpus(chunks_list):
    """Expand token-count chunks into whitespace-separated pseudo-documents.

    Args:
        chunks_list: Iterable of mappings from token (str) to the number of
            times that token occurs in one chunk (int).

    Returns:
        A list with one string per chunk. Every token is repeated ``count``
        times and each occurrence is followed by a single space, so a
        non-empty document ends with a trailing space. Tokens with a count
        of zero contribute nothing.
    """
    # str.join assembles each document in a single pass instead of growing a
    # string with += inside a nested loop.
    return [
        "".join((token + " ") * count for token, count in chunk.items())
        for chunk in chunks_list
    ]
# One pseudo-document string per chunk, with every token repeated by its count.
# NOTE(review): `texts` is not referenced again in the visible part of this
# notebook; the LDA input further down is built from the filtered frame instead.
texts = chunks_to_corpus(res.chunks)
2.2.2. Find delta TFIDF#
# Token-by-chunk count matrix: building the frame from the chunk mappings gives
# one row per chunk, so it is transposed to get tokens as rows and chunk
# numbers as columns. Tokens absent from a chunk get a count of 0.
df = pd.DataFrame(data=res.chunks).T.fillna(value=0)
# Bare expression: shows the matrix in the notebook output.
df
0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|
! | 5.0 | 0.0 | 4.0 | 2.0 | 0.0 |
" | 3.0 | 0.0 | 3.0 | 1.0 | 3.0 |
' | 9.0 | 0.0 | 4.0 | 3.0 | 2.0 |
, | 64.0 | 1.0 | 67.0 | 62.0 | 69.0 |
- | 6.0 | 0.0 | 5.0 | 7.0 | 8.0 |
... | ... | ... | ... | ... | ... |
verden | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
verre | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
vilde | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
vokste | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
voldsom | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
1625 rows × 5 columns
# Reference frequencies: the 50000 top tokens of the NB collection.
tot = dh.totals(50000)
# Total count of each token in the book (summed over all chunks) divided by
# its reference frequency. The division aligns on the token index, so tokens
# missing from either side end up as NaN.
# NOTE(review): this rebinds `res`, which until now held the dh.Chunks object;
# re-running earlier cells that use res.chunks after this line will fail.
res = df.sum(axis=1) / tot.freq
# Keep the (up to) 1000 tokens with the highest ratio, i.e. the tokens most
# over-represented in the book relative to the reference; NaN entries that
# fall inside the slice are dropped.
target_tokens = res.sort_values(ascending=False).iloc[:1000].dropna().index
# Bare expression: shows the per-chunk counts of the selected tokens.
df.loc[target_tokens]
0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|
regjerte | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 |
adel | 0.0 | 0.0 | 2.0 | 0.0 | 1.0 |
Kalv | 0.0 | 0.0 | 0.0 | 1.0 | 2.0 |
storslagen | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 |
velde | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 |
... | ... | ... | ... | ... | ... |
3 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 |
vil | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 |
Vi | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 |
dette | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
har | 0.0 | 0.0 | 0.0 | 2.0 | 3.0 |
1000 rows × 5 columns
# Rebuild one pseudo-document per chunk that contains only the selected target
# tokens, each repeated as many times as it occurs in that chunk.
# The row selection does not depend on the column, so it is done once up front
# instead of on every loop iteration.
selected_counts = df.loc[target_tokens]
outer_lst = [
    "".join(
        # Counts are stored as floats in the frame (after fillna), so they are
        # cast to int before being used as a repetition factor.
        (token + " ") * int(count)
        for token, count in selected_counts[col].items()
        if count > 0
    )
    for col in selected_counts.columns
]
# Number of pseudo-documents (one per chunk).
len(outer_lst)
5
2.2.3. Prep for LDA#
# Tokenise every pseudo-document on whitespace.
data = [document.split() for document in outer_lst]
# Dictionary assigning an integer id to each distinct token.
id2word = gensim.corpora.Dictionary(data)
# Bag-of-words representation of every tokenised document.
corpus = list(map(id2word.doc2bow, data))
2.2.4. Make model#
# Train an LDA topic model on the bag-of-words corpus.
# NOTE(review): 10 topics are requested although the corpus built above holds
# only 5 chunk documents, and no random_state is passed, so the topics are
# presumably not reproducible between runs -- confirm whether that is intended.
lda_model = gensim.models.LdaMulticore(corpus=corpus, id2word=id2word, num_topics=10)
/usr/lib/python3.12/multiprocessing/popen_fork.py:66: DeprecationWarning: This process (pid=98222) is multi-threaded, use of fork() may lead to deadlocks in the child.
self.pid = os.fork()
/usr/lib/python3.12/multiprocessing/popen_fork.py:66: DeprecationWarning: This process (pid=98222) is multi-threaded, use of fork() may lead to deadlocks in the child.
self.pid = os.fork()
/usr/lib/python3.12/multiprocessing/popen_fork.py:66: DeprecationWarning: This process (pid=98222) is multi-threaded, use of fork() may lead to deadlocks in the child.
self.pid = os.fork()
/usr/lib/python3.12/multiprocessing/popen_fork.py:66: DeprecationWarning: This process (pid=98222) is multi-threaded, use of fork() may lead to deadlocks in the child.
self.pid = os.fork()
/usr/lib/python3.12/multiprocessing/popen_fork.py:66: DeprecationWarning: This process (pid=98222) is multi-threaded, use of fork() may lead to deadlocks in the child.
self.pid = os.fork()
/usr/lib/python3.12/multiprocessing/popen_fork.py:66: DeprecationWarning: This process (pid=98222) is multi-threaded, use of fork() may lead to deadlocks in the child.
self.pid = os.fork()
/usr/lib/python3.12/multiprocessing/popen_fork.py:66: DeprecationWarning: This process (pid=98222) is multi-threaded, use of fork() may lead to deadlocks in the child.
self.pid = os.fork()
/usr/lib/python3.12/multiprocessing/popen_fork.py:66: DeprecationWarning: This process (pid=98222) is multi-threaded, use of fork() may lead to deadlocks in the child.
self.pid = os.fork()
/usr/lib/python3.12/multiprocessing/popen_fork.py:66: DeprecationWarning: This process (pid=98222) is multi-threaded, use of fork() may lead to deadlocks in the child.
self.pid = os.fork()
/usr/lib/python3.12/multiprocessing/popen_fork.py:66: DeprecationWarning: This process (pid=98222) is multi-threaded, use of fork() may lead to deadlocks in the child.
self.pid = os.fork()
/usr/lib/python3.12/multiprocessing/popen_fork.py:66: DeprecationWarning: This process (pid=98222) is multi-threaded, use of fork() may lead to deadlocks in the child.
self.pid = os.fork()
/usr/lib/python3.12/multiprocessing/popen_fork.py:66: DeprecationWarning: This process (pid=98222) is multi-threaded, use of fork() may lead to deadlocks in the child.
self.pid = os.fork()
/usr/lib/python3.12/multiprocessing/popen_fork.py:66: DeprecationWarning: This process (pid=98222) is multi-threaded, use of fork() may lead to deadlocks in the child.
self.pid = os.fork()
/usr/lib/python3.12/multiprocessing/popen_fork.py:66: DeprecationWarning: This process (pid=98222) is multi-threaded, use of fork() may lead to deadlocks in the child.
self.pid = os.fork()
/usr/lib/python3.12/multiprocessing/popen_fork.py:66: DeprecationWarning: This process (pid=98222) is multi-threaded, use of fork() may lead to deadlocks in the child.
self.pid = os.fork()
# Pretty-print the model's topics as (topic id, weighted token string) pairs.
pprint(lda_model.print_topics())
[(0,
'0.055*"," + 0.038*"og" + 0.023*"." + 0.017*"i" + 0.016*"han" + 0.012*"til" '
'+ 0.012*"med" + 0.010*"var" + 0.010*"som" + 0.009*";"'),
(1,
'0.046*"," + 0.043*"og" + 0.025*"." + 0.019*"han" + 0.016*"i" + 0.011*"som" '
'+ 0.011*"en" + 0.010*"med" + 0.010*"til" + 0.010*"av"'),
(2,
'0.023*"," + 0.020*"og" + 0.012*"." + 0.012*"i" + 0.008*";" + 0.007*"han" + '
'0.007*"til" + 0.006*"var" + 0.006*"en" + 0.006*"som"'),
(3,
'0.070*"," + 0.042*"og" + 0.037*"." + 0.025*"i" + 0.020*"han" + 0.015*"med" '
'+ 0.015*";" + 0.014*"en" + 0.013*"til" + 0.012*"av"'),
(4,
'0.068*"," + 0.051*"og" + 0.040*"." + 0.023*"i" + 0.019*"han" + 0.016*";" + '
'0.016*"en" + 0.012*"paa" + 0.012*"blev" + 0.012*"med"'),
(5,
'0.034*"," + 0.026*"og" + 0.020*"." + 0.014*"i" + 0.013*"han" + 0.009*";" + '
'0.009*"var" + 0.009*"den" + 0.008*"med" + 0.007*"som"'),
(6,
'0.027*"," + 0.018*"og" + 0.013*"." + 0.012*"i" + 0.009*"han" + 0.007*"med" '
'+ 0.006*";" + 0.006*"til" + 0.006*"var" + 0.005*"som"'),
(7,
'0.066*"," + 0.028*"og" + 0.024*"i" + 0.021*"." + 0.019*"han" + 0.014*";" + '
'0.014*"en" + 0.011*"med" + 0.010*"som" + 0.010*"til"'),
(8,
'0.076*"," + 0.050*"og" + 0.035*"." + 0.029*"i" + 0.023*"han" + 0.016*";" + '
'0.015*"den" + 0.013*"som" + 0.013*"til" + 0.013*"var"'),
(9,
'0.029*"," + 0.022*"og" + 0.021*"." + 0.013*"han" + 0.011*"i" + 0.010*"en" + '
'0.009*";" + 0.009*"som" + 0.008*"med" + 0.007*"av"')]
# Prepare the pyLDAvis visualisation data from the trained model, the
# bag-of-words corpus and the token dictionary.
prep = genvis.prepare(lda_model, corpus, id2word)
# Optional: uncomment to write the visualisation to a standalone HTML file.
# pyLDAvis.save_html(prep, "result.html")
# Switch pyLDAvis to notebook rendering, then show the visualisation inline.
pyLDAvis.enable_notebook()
pyLDAvis.display(prep)