Tutorial¶

[1]:

import evalica
import pandas as pd
import plotly.express as px
from evalica import Winner, alpha_bootstrap, bootstrap, bradley_terry

%config InlineBackend.figure_formats = ['svg']

[2]:

# Display the installed Evalica version so the recorded outputs can be tied to a release.
evalica.__version__

[2]:

'0.4.2'

Pairwise Comparisons¶

[3]:

# Load the pairwise food comparisons from a commit-pinned CSV. Every column is
# read as a string so the raw "winner" labels can be translated explicitly.
df_food = pd.read_csv(
    "https://raw.githubusercontent.com/dustalov/evalica/0893fd0f1e8107b2d62fd6c5816b55b417c1a050/food.csv",
    dtype=str,
)

# Translate the textual outcome labels into Evalica's Winner enum.
df_food["winner"] = df_food["winner"].map(
    {
        "left": Winner.X,
        "right": Winner.Y,
        "tie": Winner.Draw,
    },
)

# Series.map leaves NaN for any label missing from the dict; fail loudly here
# instead of letting missing outcomes flow into the rankers below.
assert df_food["winner"].notna().all(), "food.csv contains a winner label other than left/right/tie"

df_food.head(5)

[3]:

	left	right	winner
0	Pizza	Sushi	1
1	Burger	Pasta	2
2	Tacos	Pizza	1
3	Sushi	Tacos	2
4	Burger	Pizza	1

[4]:

# Index both item columns with Evalica; `index` is reused below to label the
# rows and columns of the comparison matrices.
indexed_left, indexed_right, index = evalica.indexing(df_food["left"], df_food["right"])
df_food["left_id"] = indexed_left
df_food["right_id"] = indexed_right

[5]:

# Build the win and tie matrices from the indexed comparisons.
matrices = evalica.matrices(
    df_food["left_id"],
    df_food["right_id"],
    df_food["winner"],
    index,
)

[6]:

# Win matrix, labelled with `index` on both axes.
pd.DataFrame(
    data=matrices.win_matrix,
    index=index,
    columns=index,
)

[6]:

	Pizza	Burger	Tacos	Sushi	Pasta
Pizza	0.0	0.0	0.0	2.0	1.0
Burger	3.0	0.0	1.0	2.0	0.0
Tacos	4.0	1.0	0.0	1.0	4.0
Sushi	0.0	4.0	1.0	0.0	1.0
Pasta	0.0	2.0	1.0	0.0	0.0

[7]:

# Tie matrix, labelled with `index` on both axes.
pd.DataFrame(
    data=matrices.tie_matrix,
    index=index,
    columns=index,
)

[7]:

	Pizza	Burger	Tacos	Sushi	Pasta
Pizza	0.0	0.0	0.0	0.0	1.0
Burger	0.0	0.0	0.0	0.0	0.0
Tacos	0.0	0.0	0.0	0.0	0.0
Sushi	0.0	0.0	0.0	0.0	1.0
Pasta	1.0	0.0	0.0	1.0	0.0

[8]:

# Score the items with Evalica's `counting` method and show the scores as a
# one-column table.
count_result = evalica.counting(
    df_food["left"],
    df_food["right"],
    df_food["winner"],
)
count_result.scores.to_frame()

[8]:

	counting
Tacos	10.0
Sushi	6.5
Burger	6.0
Pasta	4.0
Pizza	3.5

[9]:

# Score the items with Evalica's `average_win_rate` method and show the scores
# as a one-column table.
avr_result = evalica.average_win_rate(
    df_food["left"],
    df_food["right"],
    df_food["winner"],
)
avr_result.scores.to_frame()

[9]:

	average_win_rate
Tacos	0.700000
Sushi	0.479167
Burger	0.458333
Pizza	0.437500
Pasta	0.425000

[10]:

# Score the items with Evalica's `bradley_terry` method; `bt_result` is reused
# later to build the pairwise table.
bt_result = evalica.bradley_terry(
    df_food["left"],
    df_food["right"],
    df_food["winner"],
)
bt_result.scores.to_frame()

[10]:

	bradley_terry
Tacos	2.509025
Sushi	1.101156
Burger	0.854906
Pasta	0.740381
Pizza	0.571837

[11]:

# Score the items with Evalica's `newman` method and show the scores as a
# one-column table.
newman_result = evalica.newman(
    df_food["left"],
    df_food["right"],
    df_food["winner"],
)
newman_result.scores.to_frame()

[11]:

	newman
Tacos	2.665211
Sushi	1.090627
Burger	0.829660
Pasta	0.710154
Pizza	0.536813

[12]:

# Score the items with Evalica's `eigen` method and show the scores as a
# one-column table.
# NOTE(review): the recorded output ranks Tacos last here, while every other
# method in this notebook ranks Tacos first — confirm this ordering is expected.
eigen_result = evalica.eigen(
    df_food["left"],
    df_food["right"],
    df_food["winner"],
)
eigen_result.scores.to_frame()

[12]:

	eigen
Burger	0.528534
Pizza	0.505309
Sushi	0.454870
Pasta	0.439948
Tacos	0.254665

[13]:

# Score the items with Evalica's `elo` method and show the scores as a
# one-column table.
elo_result = evalica.elo(
    df_food["left"],
    df_food["right"],
    df_food["winner"],
)
elo_result.scores.to_frame()

[13]:

	elo
Tacos	1013.358777
Sushi	1002.098059
Burger	998.026093
Pasta	994.191306
Pizza	992.325765

[14]:

# Expand the Bradley–Terry scores into an item-by-item table and display it.
# In the recorded output each cell matches row_score / (row_score + column_score),
# e.g. 2.509 / (2.509 + 1.101) ≈ 0.695 for Tacos vs. Sushi.
bt_scores = bt_result.scores
df_bt_pairwise = evalica.pairwise_frame(bt_scores)
df_bt_pairwise

[14]:

	Tacos	Sushi	Burger	Pasta	Pizza
Tacos	0.500000	0.694986	0.745861	0.772149	0.814391
Sushi	0.305014	0.500000	0.562945	0.597955	0.658195
Burger	0.254139	0.437055	0.500000	0.535895	0.599201
Pasta	0.227851	0.402045	0.464105	0.500000	0.564221
Pizza	0.185609	0.341805	0.400799	0.435779	0.500000

[15]:

# Heat map of the pairwise table: the y axis (rows) is titled "Winner" and the
# x axis (columns, drawn on top) is titled "Loser".
fig = (
    px.imshow(
        df_bt_pairwise,
        text_auto=".2f",
        color_continuous_scale="RdBu",
    )
    .update_layout(
        xaxis_title="Loser",
        xaxis_side="top",
        yaxis_title="Winner",
    )
    .update_traces(hovertemplate="Winner: %{y}<br>Loser: %{x}<br>Fraction of Wins: %{z}")
)
fig.show()

[16]:

# Bootstrap the Bradley–Terry scores (10 resamples, random_state fixed at 42).
bootstrap_options = {"n_resamples": 10, "random_state": 42}
bootstrap_result = bootstrap(
    bradley_terry,
    df_food["left"],
    df_food["right"],
    df_food["winner"],
    **bootstrap_options,
)

# Reshape the distribution to long form: one (Item, Score) pair per row.
df_melted = pd.melt(
    bootstrap_result.distribution,
    var_name="Item",
    value_name="Score",
)

df_melted.head(5)

[16]:

	Item	Score
0	Tacos	3.965574
1	Tacos	3.484208
2	Tacos	3.370370
3	Tacos	1.447897
4	Tacos	3.407976

[17]:

# One horizontal box per item, summarising its bootstrapped scores.
fig = px.box(
    df_melted,
    x="Score",
    y="Item",
    color="Item",
    title="Bradley–Terry Bootstrap Scores",
)
# <extra></extra> is included in the hover template alongside the item name and score.
fig.update_traces(hovertemplate="<b>%{y}</b><br>Score: %{x:.3f}<extra></extra>")
fig.show()

Inter-Rater Reliability¶

[18]:

# Load the codings table from a commit-pinned CSV. header=None makes pandas
# number the columns (0..11 in the output) and dtype=str reads every value as a string.
codings_url = "https://raw.githubusercontent.com/dustalov/evalica/d356c3988fdf9c1db249767413a7a8a1f49d64c0/codings.csv"
df_codings = pd.read_csv(codings_url, header=None, dtype=str)

df_codings

[18]:

	0	1	2	3	4	5	6	7	8	9	10	11
0	1	2	3	3	2	1	4	1	2	NaN	NaN	NaN
1	1	2	3	3	2	2	4	1	2	5	NaN	3
2	NaN	3	3	3	2	3	4	2	2	5	1	NaN
3	1	2	3	3	2	4	4	1	2	5	1	NaN

[19]:

# Compute alpha once per distance metric and tabulate the four values.
distances = ["nominal", "ordinal", "interval", "ratio"]

alpha_values = {}
for distance in distances:
    alpha_values[distance] = evalica.alpha(df_codings, distance=distance).alpha  # type: ignore[arg-type]

pd.Series(alpha_values, name="alpha").to_frame()

[19]:

	alpha
nominal	0.743421
ordinal	0.815388
interval	0.849107
ratio	0.797403

Confidence Intervals¶

Evalica can also compute confidence intervals for Krippendorff’s alpha using bootstrapping.

[20]:

# Bootstrap alpha with the nominal distance (1000 resamples, 0.95 confidence
# level, random_state fixed at 42).
alpha_bootstrap_result = alpha_bootstrap(
    df_codings,
    distance="nominal",
    n_resamples=1000,
    confidence_level=0.95,
    random_state=42,
)

# Histogram of the bootstrap distribution.
fig = px.histogram(
    alpha_bootstrap_result.distribution,
    nbins=50,
    title="Krippendorff's Alpha Bootstrap Distribution",
    labels={"value": "Alpha", "count": "Frequency"},
)

# Mark the point estimate and both interval bounds with vertical lines,
# added in the order: estimate, lower bound, upper bound.
reference_lines = [
    (alpha_bootstrap_result.alpha, "dash", "red", "Point Estimate"),
    (alpha_bootstrap_result.low, "dot", "blue", "Lower Bound"),
    (alpha_bootstrap_result.high, "dot", "blue", "Upper Bound"),
]
for position, dash_style, color, caption in reference_lines:
    fig.add_vline(x=position, line_dash=dash_style, line_color=color, annotation_text=caption)
fig.show()

	0	1	2	3	4	5	6	7	8	9	10	11
0	1	2	3	3	2	1	4	1	2	NaN	NaN	NaN
1	1	2	3	3	2	2	4	1	2	5	NaN	3
2	NaN	3	3	3	2	3	4	2	2	5	1	NaN
3	1	2	3	3	2	4	4	1	2	5	1	NaN

	0	1	2	3	4	5	6	7	8	9	10	11
0	1	2	3	3	2	1	4	1	2	NaN	NaN	NaN
1	1	2	3	3	2	2	4	1	2	5	NaN	3
2	NaN	3	3	3	2	3	4	2	2	5	1	NaN
3	1	2	3	3	2	4	4	1	2	5	1	NaN

	0	1	2	3	4	5	6	7	8	9	10	11
0	1	2	3	3	2	1	4	1	2	NaN	NaN	NaN
1	1	2	3	3	2	2	4	1	2	5	NaN	3
2	NaN	3	3	3	2	3	4	2	2	5	1	NaN
3	1	2	3	3	2	4	4	1	2	5	1	NaN