Tutorial

Open in Colab Binder

[1]:
import evalica
import pandas as pd
import plotly.express as px
from evalica import Winner, alpha_bootstrap, bootstrap, bradley_terry

%config InlineBackend.figure_formats = ['svg']
[2]:
# Display the version of the installed evalica package.
evalica.__version__
[2]:
'0.4.2'

Pairwise Comparisons

[3]:
# The URL pins one specific revision of the example data set.
FOOD_CSV_URL = "https://raw.githubusercontent.com/dustalov/evalica/0893fd0f1e8107b2d62fd6c5816b55b417c1a050/food.csv"

# Textual outcome labels found in the CSV -> evalica's `Winner` values.
winner_by_label = {"left": Winner.X, "right": Winner.Y, "tie": Winner.Draw}

# Read every column as a string, then replace the outcome labels in place.
df_food = pd.read_csv(FOOD_CSV_URL, dtype=str)
df_food["winner"] = df_food["winner"].map(winner_by_label)

df_food.head(5)
[3]:
left right winner
0 Pizza Sushi 1
1 Burger Pasta 2
2 Tacos Pizza 1
3 Sushi Tacos 2
4 Burger Pizza 1
[4]:
# `evalica.indexing` yields three values: identifiers for the left items,
# identifiers for the right items, and the index shared by both sides.
left_ids, right_ids, index = evalica.indexing(df_food["left"], df_food["right"])
df_food["left_id"] = left_ids
df_food["right_id"] = right_ids
[5]:
# Build the comparison matrices from the indexed pairs; the result exposes
# `win_matrix` and `tie_matrix`, which the following cells display.
matrices = evalica.matrices(
    df_food["left_id"],
    df_food["right_id"],
    df_food["winner"],
    index,
)
[6]:
# Win matrix, with rows and columns labelled by item name via `index`.
win_frame = pd.DataFrame(data=matrices.win_matrix, index=index, columns=index)
win_frame
[6]:
Pizza Burger Tacos Sushi Pasta
Pizza 0.0 0.0 0.0 2.0 1.0
Burger 3.0 0.0 1.0 2.0 0.0
Tacos 4.0 1.0 0.0 1.0 4.0
Sushi 0.0 4.0 1.0 0.0 1.0
Pasta 0.0 2.0 1.0 0.0 0.0
[7]:
# Tie matrix, with rows and columns labelled by item name via `index`.
tie_frame = pd.DataFrame(data=matrices.tie_matrix, index=index, columns=index)
tie_frame
[7]:
Pizza Burger Tacos Sushi Pasta
Pizza 0.0 0.0 0.0 0.0 1.0
Burger 0.0 0.0 0.0 0.0 0.0
Tacos 0.0 0.0 0.0 0.0 0.0
Sushi 0.0 0.0 0.0 0.0 1.0
Pasta 1.0 0.0 0.0 1.0 0.0
[8]:
# Counting-based scores for every item in the food comparisons.
count_result = evalica.counting(
    df_food["left"],
    df_food["right"],
    df_food["winner"],
)
count_scores = count_result.scores
count_scores.to_frame()
[8]:
counting
Tacos 10.0
Sushi 6.5
Burger 6.0
Pasta 4.0
Pizza 3.5
[9]:
# Average win rate scores for every item in the food comparisons.
avr_result = evalica.average_win_rate(
    df_food["left"],
    df_food["right"],
    df_food["winner"],
)
avr_scores = avr_result.scores
avr_scores.to_frame()
[9]:
average_win_rate
Tacos 0.700000
Sushi 0.479167
Burger 0.458333
Pizza 0.437500
Pasta 0.425000
[10]:
# Bradley–Terry scores; `bt_result` is reused later for the pairwise table.
bt_result = evalica.bradley_terry(
    df_food["left"],
    df_food["right"],
    df_food["winner"],
)
bt_scores = bt_result.scores
bt_scores.to_frame()
[10]:
bradley_terry
Tacos 2.509025
Sushi 1.101156
Burger 0.854906
Pasta 0.740381
Pizza 0.571837
[11]:
# Newman scores for every item in the food comparisons.
newman_result = evalica.newman(
    df_food["left"],
    df_food["right"],
    df_food["winner"],
)
newman_scores = newman_result.scores
newman_scores.to_frame()
[11]:
newman
Tacos 2.665211
Sushi 1.090627
Burger 0.829660
Pasta 0.710154
Pizza 0.536813
[12]:
# Eigenvector-based scores for every item in the food comparisons.
# NOTE(review): in the recorded output this ranking is close to the reverse of
# the other methods (Tacos is last here but first elsewhere) — confirm that
# this is expected before relying on it.
eigen_result = evalica.eigen(
    df_food["left"],
    df_food["right"],
    df_food["winner"],
)
eigen_scores = eigen_result.scores
eigen_scores.to_frame()
[12]:
eigen
Burger 0.528534
Pizza 0.505309
Sushi 0.454870
Pasta 0.439948
Tacos 0.254665
[13]:
# Elo ratings for every item in the food comparisons.
elo_result = evalica.elo(
    df_food["left"],
    df_food["right"],
    df_food["winner"],
)
elo_scores = elo_result.scores
elo_scores.to_frame()
[13]:
elo
Tacos 1013.358777
Sushi 1002.098059
Burger 998.026093
Pasta 994.191306
Pizza 992.325765
[14]:
# Expand the Bradley–Terry scores into an item-by-item pairwise table.
bt_item_scores = bt_result.scores
df_bt_pairwise = evalica.pairwise_frame(bt_item_scores)

df_bt_pairwise
[14]:
Tacos Sushi Burger Pasta Pizza
Tacos 0.500000 0.694986 0.745861 0.772149 0.814391
Sushi 0.305014 0.500000 0.562945 0.597955 0.658195
Burger 0.254139 0.437055 0.500000 0.535895 0.599201
Pasta 0.227851 0.402045 0.464105 0.500000 0.564221
Pizza 0.185609 0.341805 0.400799 0.435779 0.500000
[15]:
# Heatmap of the Bradley–Terry pairwise table. Each cell equals
# score[row] / (score[row] + score[column]), i.e. the model's estimated
# probability that the row item beats the column item. It is not an empirical
# fraction of observed wins, so the hover text calls it "Win Probability".
fig = px.imshow(df_bt_pairwise, color_continuous_scale="RdBu", text_auto=".2f")

# Rows are the would-be winners and columns the would-be losers; the x axis is
# moved to the top so the figure reads like the table above it.
fig.update_layout(xaxis_title="Loser", yaxis_title="Winner", xaxis_side="top")

# Three decimals keep the hover text readable; the empty <extra></extra> tag
# hides the secondary hover box, matching the box plot further down.
fig.update_traces(
    hovertemplate="Winner: %{y}<br>Loser: %{x}<br>Win Probability: %{z:.3f}<extra></extra>",
)
fig.show()
[16]:
# Bootstrap the Bradley–Terry scores: 10 resamples with a fixed seed.
comparison_columns = (df_food["left"], df_food["right"], df_food["winner"])
bootstrap_options = {"n_resamples": 10, "random_state": 42}

bootstrap_result = bootstrap(bradley_terry, *comparison_columns, **bootstrap_options)

# Reshape the distribution to long form (one row per item/score pair),
# which is the layout the box plot in the next cell consumes.
score_samples = bootstrap_result.distribution
df_melted = score_samples.melt(var_name="Item", value_name="Score")

df_melted.head(5)
[16]:
Item Score
0 Tacos 3.965574
1 Tacos 3.484208
2 Tacos 3.370370
3 Tacos 1.447897
4 Tacos 3.407976
[17]:
# One horizontal box per item, summarising its bootstrapped scores.
box_hover = "<b>%{y}</b><br>Score: %{x:.3f}<extra></extra>"

fig = px.box(
    df_melted,
    x="Score",
    y="Item",
    color="Item",
    title="Bradley–Terry Bootstrap Scores",
)
fig.update_traces(hovertemplate=box_hover)
fig.show()

Inter-Rater Reliability

[18]:
# The URL pins one specific revision of the example codings.
CODINGS_CSV_URL = "https://raw.githubusercontent.com/dustalov/evalica/d356c3988fdf9c1db249767413a7a8a1f49d64c0/codings.csv"

# header=None: the first line is data, so pandas numbers the columns itself.
# dtype=str: keep every present value as a string.
df_codings = pd.read_csv(CODINGS_CSV_URL, header=None, dtype=str)

df_codings
[18]:
0 1 2 3 4 5 6 7 8 9 10 11
0 1 2 3 3 2 1 4 1 2 NaN NaN NaN
1 1 2 3 3 2 2 4 1 2 5 NaN 3
2 NaN 3 3 3 2 3 4 2 2 5 1 NaN
3 1 2 3 3 2 4 4 1 2 5 1 NaN
[19]:
# Compute alpha once per distance, keeping the order listed here.
distances = ["nominal", "ordinal", "interval", "ratio"]

alpha_values = {}
for dist in distances:
    alpha_values[dist] = evalica.alpha(df_codings, distance=dist).alpha  # type: ignore[arg-type]

alpha_series = pd.Series(alpha_values, name="alpha")
alpha_series.to_frame()
[19]:
alpha
nominal 0.743421
ordinal 0.815388
interval 0.849107
ratio 0.797403

Confidence Intervals

Evalica can also compute confidence intervals for Krippendorff’s alpha using bootstrapping.

[20]:
# Bootstrap alpha with the nominal distance: 1000 resamples, 95% confidence
# level, fixed seed.
alpha_bootstrap_result = alpha_bootstrap(
    df_codings,
    distance="nominal",
    n_resamples=1000,
    confidence_level=0.95,
    random_state=42,
)

# Histogram of the bootstrapped alpha values.
alpha_samples = alpha_bootstrap_result.distribution
axis_labels = {"value": "Alpha", "count": "Frequency"}
fig = px.histogram(
    alpha_samples,
    nbins=50,
    title="Krippendorff's Alpha Bootstrap Distribution",
    labels=axis_labels,
)

# Vertical reference lines as (x position, dash style, colour, annotation),
# drawn in this order: point estimate, then the two interval bounds.
reference_lines = [
    (alpha_bootstrap_result.alpha, "dash", "red", "Point Estimate"),
    (alpha_bootstrap_result.low, "dot", "blue", "Lower Bound"),
    (alpha_bootstrap_result.high, "dot", "blue", "Upper Bound"),
]
for position, dash_style, colour, caption in reference_lines:
    fig.add_vline(x=position, line_dash=dash_style, line_color=colour, annotation_text=caption)
fig.show()