#!pip install -U dhlab
import os 
import datetime as dt
import pandas as pd

import dhlab as dh
from code.sentiment import load_corpus_from_file, compute_sentiment_analysis, add_urls

2.1. Sentimentanalyse i aviskorpus#

  1. Angi nøkkelord

  2. Hent inn korpus

  3. Score sentiment for forekomster av et ord

  4. Lagre resultater i CSV-fil

# INPUT REQUIRED
word = "biblioteket"  # Keyword that must occur in the texts

2.1.1. Korpusdefinisjon#

Fyll inn filsti til lokal csv- eller excel-fil med korpusdefinisjon:

# INPUT REQUIRED
# Path to a local CSV or Excel file holding a corpus definition;
# leave the placeholder to build the corpus from parameters instead.
file_path = "FYLL INN"

Eventuelt definer et korpus direkte med parametere.

# INPUT REQUIRED
city = "Kristiansand"     # Place of publication for the newspapers
from_year = 2000          # Start of the time period searched
to_year = 2022            # End of the time period
number_of_docs = 10000    # Max number of documents in the text selection
# The corpus variable points to the corpus definition, as a dhlab.Corpus object.
# Prefer a local corpus-definition file when one exists; otherwise build the
# corpus from the search parameters defined above.
if os.path.exists(file_path):
    corpus = load_corpus_from_file(file_path)

    print(f"Lastet corpus fra fil: {file_path}")
else:
    corpus = dh.Corpus(
        doctype="digavis",          # newspapers only
        fulltext=word,              # keyword must occur in the full text
        freetext=f"city: {city}",   # restrict to the chosen place of publication
        from_year=from_year,
        to_year=to_year,
        limit=number_of_docs,
    )

    print("Lastet corpus fra parametere: ")
    print(f"Aviser, utgitt mellom {from_year} og {to_year} i {city}, der ordet \"{word}\" forekommer.")

print(f"corpus består av {corpus.size} utgivelser")
Lastet corpus fra parametere: 
Aviser, utgitt mellom 2000 og 2022 i Kristiansand, der ordet "biblioteket" forekommer.
corpus består av 3441 utgivelser

Valgfritt: Sett save_to_file=True hvis du vil lagre resulterende korpus til en ny, datostemplet CSV-fil

# INPUT REQUIRED
save_to_file = False  # Whether to save the text selection as a CSV file for later reuse

if save_to_file:
    # Date-stamped file name encoding the corpus year range, city and keyword.
    new_file_path = (
        f"corpus_avis_{corpus.frame.year.min()}_{corpus.frame.year.max()}"
        f"_{city}_{word}_created{dt.date.today()}.csv"
    )
    corpus.to_csv(new_file_path)

2.1.2. Sentimentscore#

Kontekstene der nøkkelordet forekommer hentes ut fra korpuset, og vi beregner en sentimentscore på hvert tekstutdrag.

Det er en enkel analyse, med positive og negative ordlister: Vi regner ut differansen mellom summene av positive og negative ord i konteksten.

Ordlistene vi bruker her, NorSentLex, er utviklet av LTG-gruppen ved UiO ifm. SANT-prosjektet.

# Compute a sentiment score for the contexts in which the keyword occurs.
result = compute_sentiment_analysis(corpus, word)
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
Cell In[7], line 2
      1 # Beregn en sentimentscore til kontekstene der nøkkelordet forekommer.
----> 2 result = compute_sentiment_analysis(corpus, word)

File ~/projects/digital_tekstanalyse/cookbook/code/sentiment.py:260, in compute_sentiment_analysis(*args, **kwargs)
    258 def compute_sentiment_analysis(*args, **kwargs):
    259     """Compute sentiment score on the input data."""
--> 260     return count_and_score_target_words(*args, **kwargs)

File ~/projects/digital_tekstanalyse/cookbook/code/sentiment.py:243, in count_and_score_target_words(corpus, word)
    237 word_freq = word_freq.merge(
    238     conc, how="inner", left_on=docid_column, right_on=docid_column
    239 )
    241 pos, neg = load_norsentlex()
--> 243 word_freq[["positive", "negative"]] = word_freq.apply(
    244     lambda x: score_sentiment(x.conc, pos, neg), axis=1, result_type="expand"
    245 )
    246 word_freq["sentimentscore"] = word_freq["positive"] - word_freq["negative"]
    248 df = corpus.frame.merge(
    249     word_freq.drop(columns="conc"),
    250     how="inner",
    251     left_on=docid_column,
    252     right_on=docid_column,
    253 )

File ~/.cache/pypoetry/virtualenvs/digital-tekstanalyse-kDjkoATB-py3.10/lib/python3.10/site-packages/pandas/core/frame.py:9423, in DataFrame.apply(self, func, axis, raw, result_type, args, **kwargs)
   9412 from pandas.core.apply import frame_apply
   9414 op = frame_apply(
   9415     self,
   9416     func=func,
   (...)
   9421     kwargs=kwargs,
   9422 )
-> 9423 return op.apply().__finalize__(self, method="apply")

File ~/.cache/pypoetry/virtualenvs/digital-tekstanalyse-kDjkoATB-py3.10/lib/python3.10/site-packages/pandas/core/apply.py:678, in FrameApply.apply(self)
    675 elif self.raw:
    676     return self.apply_raw()
--> 678 return self.apply_standard()

File ~/.cache/pypoetry/virtualenvs/digital-tekstanalyse-kDjkoATB-py3.10/lib/python3.10/site-packages/pandas/core/apply.py:798, in FrameApply.apply_standard(self)
    797 def apply_standard(self):
--> 798     results, res_index = self.apply_series_generator()
    800     # wrap results
    801     return self.wrap_results(results, res_index)

File ~/.cache/pypoetry/virtualenvs/digital-tekstanalyse-kDjkoATB-py3.10/lib/python3.10/site-packages/pandas/core/apply.py:814, in FrameApply.apply_series_generator(self)
    811 with option_context("mode.chained_assignment", None):
    812     for i, v in enumerate(series_gen):
    813         # ignore SettingWithCopy here in case the user mutates
--> 814         results[i] = self.f(v)
    815         if isinstance(results[i], ABCSeries):
    816             # If we have a view on v, we need to make a copy because
    817             #  series_generator will swap out the underlying data
    818             results[i] = results[i].copy(deep=False)

File ~/projects/digital_tekstanalyse/cookbook/code/sentiment.py:244, in count_and_score_target_words.<locals>.<lambda>(x)
    237 word_freq = word_freq.merge(
    238     conc, how="inner", left_on=docid_column, right_on=docid_column
    239 )
    241 pos, neg = load_norsentlex()
    243 word_freq[["positive", "negative"]] = word_freq.apply(
--> 244     lambda x: score_sentiment(x.conc, pos, neg), axis=1, result_type="expand"
    245 )
    246 word_freq["sentimentscore"] = word_freq["positive"] - word_freq["negative"]
    248 df = corpus.frame.merge(
    249     word_freq.drop(columns="conc"),
    250     how="inner",
    251     left_on=docid_column,
    252     right_on=docid_column,
    253 )

File ~/projects/digital_tekstanalyse/cookbook/code/sentiment.py:217, in score_sentiment(text, positive, negative)
    215 """Calculate a sentiment score for the ``text`` input."""
    216 context = count_tokens(text)
--> 217 sent_counts = [
    218     count_matching_tokens(context, sent_terms).counts.sum()
    219     if not context.empty
    220     else 0
    221     for sent_terms in (positive, negative)
    222 ]
    223 return sent_counts

File ~/projects/digital_tekstanalyse/cookbook/code/sentiment.py:218, in <listcomp>(.0)
    215 """Calculate a sentiment score for the ``text`` input."""
    216 context = count_tokens(text)
    217 sent_counts = [
--> 218     count_matching_tokens(context, sent_terms).counts.sum()
    219     if not context.empty
    220     else 0
    221     for sent_terms in (positive, negative)
    222 ]
    223 return sent_counts

File ~/projects/digital_tekstanalyse/cookbook/code/sentiment.py:144, in count_matching_tokens(token_counts, terms)
    142 def count_matching_tokens(token_counts: pd.DataFrame, terms: pd.Series) -> pd.DataFrame:
    143     """Combine word counts with a series of terms."""
--> 144     target_terms = terms.join(token_counts, how="inner", on="terms")
    145     return target_terms

File ~/.cache/pypoetry/virtualenvs/digital-tekstanalyse-kDjkoATB-py3.10/lib/python3.10/site-packages/pandas/core/frame.py:9729, in DataFrame.join(self, other, on, how, lsuffix, rsuffix, sort, validate)
   9566 def join(
   9567     self,
   9568     other: DataFrame | Series | Iterable[DataFrame | Series],
   (...)
   9574     validate: str | None = None,
   9575 ) -> DataFrame:
   9576     """
   9577     Join columns of another DataFrame.
   9578 
   (...)
   9727     5  K1  A5   B1
   9728     """
-> 9729     return self._join_compat(
   9730         other,
   9731         on=on,
   9732         how=how,
   9733         lsuffix=lsuffix,
   9734         rsuffix=rsuffix,
   9735         sort=sort,
   9736         validate=validate,
   9737     )

File ~/.cache/pypoetry/virtualenvs/digital-tekstanalyse-kDjkoATB-py3.10/lib/python3.10/site-packages/pandas/core/frame.py:9768, in DataFrame._join_compat(self, other, on, how, lsuffix, rsuffix, sort, validate)
   9758     if how == "cross":
   9759         return merge(
   9760             self,
   9761             other,
   (...)
   9766             validate=validate,
   9767         )
-> 9768     return merge(
   9769         self,
   9770         other,
   9771         left_on=on,
   9772         how=how,
   9773         left_index=on is None,
   9774         right_index=True,
   9775         suffixes=(lsuffix, rsuffix),
   9776         sort=sort,
   9777         validate=validate,
   9778     )
   9779 else:
   9780     if on is not None:

File ~/.cache/pypoetry/virtualenvs/digital-tekstanalyse-kDjkoATB-py3.10/lib/python3.10/site-packages/pandas/core/reshape/merge.py:162, in merge(left, right, how, on, left_on, right_on, left_index, right_index, sort, suffixes, copy, indicator, validate)
    131 @Substitution("\nleft : DataFrame or named Series")
    132 @Appender(_merge_doc, indents=0)
    133 def merge(
   (...)
    146     validate: str | None = None,
    147 ) -> DataFrame:
    148     op = _MergeOperation(
    149         left,
    150         right,
   (...)
    160         validate=validate,
    161     )
--> 162     return op.get_result(copy=copy)

File ~/.cache/pypoetry/virtualenvs/digital-tekstanalyse-kDjkoATB-py3.10/lib/python3.10/site-packages/pandas/core/reshape/merge.py:809, in _MergeOperation.get_result(self, copy)
    806 if self.indicator:
    807     self.left, self.right = self._indicator_pre_merge(self.left, self.right)
--> 809 join_index, left_indexer, right_indexer = self._get_join_info()
    811 result = self._reindex_and_concat(
    812     join_index, left_indexer, right_indexer, copy=copy
    813 )
    814 result = result.__finalize__(self, method=self._merge_type)

File ~/.cache/pypoetry/virtualenvs/digital-tekstanalyse-kDjkoATB-py3.10/lib/python3.10/site-packages/pandas/core/reshape/merge.py:1065, in _MergeOperation._get_join_info(self)
   1061     join_index, right_indexer, left_indexer = _left_join_on_index(
   1062         right_ax, left_ax, self.right_join_keys, sort=self.sort
   1063     )
   1064 else:
-> 1065     (left_indexer, right_indexer) = self._get_join_indexers()
   1067     if self.right_index:
   1068         if len(self.left) > 0:

File ~/.cache/pypoetry/virtualenvs/digital-tekstanalyse-kDjkoATB-py3.10/lib/python3.10/site-packages/pandas/core/reshape/merge.py:1038, in _MergeOperation._get_join_indexers(self)
   1036 def _get_join_indexers(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
   1037     """return the join indexers"""
-> 1038     return get_join_indexers(
   1039         self.left_join_keys, self.right_join_keys, sort=self.sort, how=self.how
   1040     )

File ~/.cache/pypoetry/virtualenvs/digital-tekstanalyse-kDjkoATB-py3.10/lib/python3.10/site-packages/pandas/core/reshape/merge.py:1665, in get_join_indexers(left_keys, right_keys, sort, how, **kwargs)
   1660 # get left & right join labels and num. of levels at each location
   1661 mapped = (
   1662     _factorize_keys(left_keys[n], right_keys[n], sort=sort, how=how)
   1663     for n in range(len(left_keys))
   1664 )
-> 1665 zipped = zip(*mapped)
   1666 llab, rlab, shape = (list(x) for x in zipped)
   1668 # get flat i8 keys from label lists

File ~/.cache/pypoetry/virtualenvs/digital-tekstanalyse-kDjkoATB-py3.10/lib/python3.10/site-packages/pandas/core/reshape/merge.py:1662, in <genexpr>(.0)
   1658         return _get_no_sort_one_missing_indexer(left_n, False)
   1660 # get left & right join labels and num. of levels at each location
   1661 mapped = (
-> 1662     _factorize_keys(left_keys[n], right_keys[n], sort=sort, how=how)
   1663     for n in range(len(left_keys))
   1664 )
   1665 zipped = zip(*mapped)
   1666 llab, rlab, shape = (list(x) for x in zipped)

File ~/.cache/pypoetry/virtualenvs/digital-tekstanalyse-kDjkoATB-py3.10/lib/python3.10/site-packages/pandas/core/reshape/merge.py:2442, in _factorize_keys(lk, rk, sort, how)
   2435     rlab = rizer.factorize(
   2436         rk.to_numpy(na_value=1, dtype=lk.dtype.numpy_dtype), mask=rk.isna()
   2437     )
   2438 else:
   2439     # Argument 1 to "factorize" of "ObjectFactorizer" has incompatible type
   2440     # "Union[ndarray[Any, dtype[signedinteger[_64Bit]]],
   2441     # ndarray[Any, dtype[object_]]]"; expected "ndarray[Any, dtype[object_]]"
-> 2442     llab = rizer.factorize(lk)  # type: ignore[arg-type]
   2443     rlab = rizer.factorize(rk)  # type: ignore[arg-type]
   2444 assert llab.dtype == np.dtype(np.intp), llab.dtype

KeyboardInterrupt: 
result.head()
dhlabid urn title city timestamp year doctype word count positive negative sentimentscore
0 201310916 URN:NBN:no-nb_digavis_oestsida_null_null_20040... oestsida Kristiansand 20040211 2004 digavis biblioteket 4 0 0 0
1 201310916 URN:NBN:no-nb_digavis_oestsida_null_null_20040... oestsida Kristiansand 20040211 2004 digavis biblioteket 4 1 0 1
2 201310916 URN:NBN:no-nb_digavis_oestsida_null_null_20040... oestsida Kristiansand 20040211 2004 digavis biblioteket 4 0 1 -1
3 201310897 URN:NBN:no-nb_digavis_oestsida_null_null_20021... oestsida Kristiansand 20021204 2002 digavis biblioteket 1 3 1 2
4 201310918 URN:NBN:no-nb_digavis_oestsida_null_null_20040... oestsida Kristiansand 20040310 2004 digavis biblioteket 6 1 2 -1

2.1.2.1. Tilbakekobling fra datapunkt til kildetekst:#

URNene i datarammen er referanser til de scannede dokumentene. Ved å bytte ut “URN” i følgende adresse med en reell verdi kan du lese avisen i nettbiblioteket (nb.no): https://www.nb.no/items/URN

f.eks. https://www.nb.no/items/URN:NBN:no-nb_digavis_kristiansandavis_null_null_20100204_4_5_1

Vi har også en funksjon som konstruerer URLene og lagrer dem til en ny kolonne i datarammen. Outputet fra kodeblokken under må kopieres og limes inn i nettleseren.

row_number = 1  # Change the row number as desired

# Add a URL column linking each row to the scanned newspaper in the web
# library, then print the link for the chosen row.
result = add_urls(result)
print(result.url[row_number])
https://www.nb.no/items/URN:NBN:no-nb_digavis_oestsida_null_null_20040211_3_3_1?searchText=biblioteket
result
dhlabid urn title city timestamp year doctype word count positive negative sentimentscore url
0 201310916 URN:NBN:no-nb_digavis_oestsida_null_null_20040... oestsida Kristiansand 20040211 2004 digavis biblioteket 4 0 0 0 https://www.nb.no/items/URN:NBN:no-nb_digavis_...
1 201310916 URN:NBN:no-nb_digavis_oestsida_null_null_20040... oestsida Kristiansand 20040211 2004 digavis biblioteket 4 1 0 1 https://www.nb.no/items/URN:NBN:no-nb_digavis_...
2 201310916 URN:NBN:no-nb_digavis_oestsida_null_null_20040... oestsida Kristiansand 20040211 2004 digavis biblioteket 4 0 1 -1 https://www.nb.no/items/URN:NBN:no-nb_digavis_...
3 201310897 URN:NBN:no-nb_digavis_oestsida_null_null_20021... oestsida Kristiansand 20021204 2002 digavis biblioteket 1 3 1 2 https://www.nb.no/items/URN:NBN:no-nb_digavis_...
4 201310918 URN:NBN:no-nb_digavis_oestsida_null_null_20040... oestsida Kristiansand 20040310 2004 digavis biblioteket 6 1 2 -1 https://www.nb.no/items/URN:NBN:no-nb_digavis_...
... ... ... ... ... ... ... ... ... ... ... ... ... ...
5577 200483072 URN:NBN:no-nb_digavis_faedrelandsvennen_null_n... faedrelandsvennen Kristiansand 20021008 2002 digavis biblioteket 10 1 1 0 https://www.nb.no/items/URN:NBN:no-nb_digavis_...
5578 200486058 URN:NBN:no-nb_digavis_faedrelandsvennen_null_n... faedrelandsvennen Kristiansand 20160830 2016 digavis biblioteket 1 1 2 -1 https://www.nb.no/items/URN:NBN:no-nb_digavis_...
5579 200485951 URN:NBN:no-nb_digavis_faedrelandsvennen_null_n... faedrelandsvennen Kristiansand 20160423 2016 digavis biblioteket 2 1 0 1 https://www.nb.no/items/URN:NBN:no-nb_digavis_...
5580 200485951 URN:NBN:no-nb_digavis_faedrelandsvennen_null_n... faedrelandsvennen Kristiansand 20160423 2016 digavis biblioteket 2 1 0 1 https://www.nb.no/items/URN:NBN:no-nb_digavis_...
5581 200482629 URN:NBN:no-nb_digavis_faedrelandsvennen_null_n... faedrelandsvennen Kristiansand 20010417 2001 digavis biblioteket 1 5 0 5 https://www.nb.no/items/URN:NBN:no-nb_digavis_...

5582 rows × 13 columns

2.1.3. Visualiser resultatet#

# Sum the sentiment columns per publication year and plot the yearly totals.
yearly = result[["year", "positive", "negative", "sentimentscore"]]
yearly_sums = yearly.groupby("year")[["sentimentscore", "positive", "negative"]].sum()
yearly_sums.plot()
<Axes: xlabel='year'>
../_images/90546ad084eb777b4f87ed8da158314331312fc9caecd994e49553f28b7b2a78.png

2.1.4. Lagre data#

Skriv utdata til en CSV-fil på ditt lokale filsystem:

Kolonne

Beskrivelse

dhlabid

DH-labens ID-nummer for det digitale tekstobjektet (OCR-scannet tekst) i databasene

urn

Unique Resource Name (digitalt bilde av tekstdokumentet, tilgjengelig i nettbiblioteket)

title

Avistittel, navn på publikasjon

city

Publiseringssted (oftest en by)

timestamp

datostempel i ISO-format (YYYYMMDD)

year

årstall for publikasjonen

doctype

Dokumenttype (her er det bare aviser, “digavis”)

word

nøkkelord i tekstutdragene (konkordansene) som sentimentanalysen ble utført på

count

ordfrekvens: antall ganger nøkkelordet forekommer i den gitte avisutgivelsen

positive

antall positive ord i kontekstene nøkkelordet forekommer i

negative

antall negative ord i kontekstene

sentimentscore

differansen positiv score - negativ score

url

lenke til avisen i nettbiblioteket, inkl. søk på nøkkelordet

# Write the output data to a date-stamped CSV file on the local file system.
outputfile = f"sentimentanalyse_aviskorpus_{from_year}-{to_year}_{dt.date.today()}.csv"
result.to_csv(outputfile)