Code
import pandas as pd
import polars as pl
import os
import altair as alt
import polars.selectors as cs
from camminapy.plot import altair_theme
Code
force = True
if os.path.exists(filename := "data.parquet") and not force:
    df = pl.read_parquet(filename)
else:
    df = pl.DataFrame(
        pd.concat(
            [
                pd.read_html(f"https://www.letour.fr/en/rankings/stage-{i}")[0].assign(
                    stage=i
                )
                for i in range(1, 18)
            ]
        )
    )
    df.write_parquet(filename)

df = (
    df.with_columns(
        pl.col("Times")
        .str.strptime(pl.Time, format="""%Hh %M' %S''""")
        .cast(pl.Duration)
    )
    .with_columns(
        pl.col("Gap")
        .apply(lambda s: str(s).replace("+ ", "").replace("-", "0h 0' 0''"))
        .str.strptime(pl.Time, format="""%Hh %M' %S''""", strict=True)
        .cast(pl.Duration)
    )
    .rename(mapping={"stage": "Stage"})
    .select("Stage", "Rank", "Rider", "Team", "Rider No.", "Times", "Gap")
    .with_columns(pl.col("Times").rank().over("Stage").alias("Overall Rank"))
    .with_columns(
        pl.col("Rider").cast(pl.Categorical),
        pl.col("Team").cast(pl.Categorical),
    )
    .sort("Stage", "Times")
    .with_columns(pl.col("Times").cumsum().over("Rider").alias("Overall Time"))
    .with_columns(
        pl.col("Overall Time").rank().over("Stage").alias("Overall Rank").cast(int)
    )
    .with_columns(
        (pl.col("Overall Time") - pl.col("Overall Time").min())
        .over("Stage")
        .alias("Overall Gap")
    )
)

df
shape: (2_847, 10)
Stage Rank Rider Team Rider No. Times Gap Overall Rank Overall Time Overall Gap
i64 i64 cat cat i64 duration[μs] duration[μs] i64 duration[μs] duration[μs]
1 1 "A. YATES" "UAE TEAM EMIRA… 19 4h 22m 49s 0µs 1 4h 22m 49s 0µs
1 2 "S. YATES" "TEAM JAYCO ALU… 161 4h 22m 53s 4s 2 4h 22m 53s 4s
1 3 "T. POGAČAR" "UAE TEAM EMIRA… 11 4h 23m 1s 12s 8 4h 23m 1s 12s
1 4 "T. PINOT" "GROUPAMA - FDJ… 37 4h 23m 1s 12s 8 4h 23m 1s 12s
1 5 "M. WOODS" "ISRAEL - PREMI… 151 4h 23m 1s 12s 8 4h 23m 1s 12s
1 6 "V. LAFAY" "COFIDIS" 125 4h 23m 1s 12s 8 4h 23m 1s 12s
1 7 "J. HINDLEY" "BORA - HANSGRO… 71 4h 23m 1s 12s 8 4h 23m 1s 12s
1 8 "S. JENSEN" "LIDL - TREK" 83 4h 23m 1s 12s 8 4h 23m 1s 12s
1 9 "J. VINGEGAARD" "JUMBO-VISMA" 1 4h 23m 1s 12s 8 4h 23m 1s 12s
1 10 "D. GAUDU" "GROUPAMA - FDJ… 31 4h 23m 1s 12s 8 4h 23m 1s 12s
1 11 "W. VAN AERT" "JUMBO-VISMA" 6 4h 23m 1s 12s 8 4h 23m 1s 12s
1 12 "M. LANDA" "BAHRAIN VICTOR… 62 4h 23m 1s 12s 8 4h 23m 1s 12s
17 143 "Y. FEDOROV" "ASTANA QAZAQST… 194 5h 30m 47s 41m 39s 151 3d 1h 18m 30s 5h 20m 14s
17 144 "C. BOL" "ASTANA QAZAQST… 192 5h 30m 47s 41m 39s 153 3d 1h 21m 4s 5h 22m 48s
17 145 "M. MØRKØV" "SOUDAL QUICK-S… 58 5h 30m 57s 41m 49s 154 3d 1h 30m 31s 5h 32m 15s
17 146 "M. PEDERSEN" "LIDL - TREK" 86 5h 30m 58s 41m 50s 114 3d 23m 26s 4h 25m 10s
17 147 "A. KIRSCH" "LIDL - TREK" 84 5h 30m 58s 41m 50s 115 3d 23m 52s 4h 25m 36s
17 148 "A. PETIT" "INTERMARCHÉ - … 115 5h 31m 10s 42m 2s 144 3d 1h 3m 44s 5h 5m 28s
17 149 "S. WÆRENSKJOLD… "UNO-X PRO CYCL… 207 5h 31m 22s 42m 14s 143 3d 1h 1m 37s 5h 3m 21s
17 150 "E. REINDERS" "TEAM JAYCO ALU… 168 5h 31m 28s 42m 20s 142 3d 1h 49s 5h 2m 33s
17 151 "D. GROENEWEGEN… "TEAM JAYCO ALU… 164 5h 31m 28s 42m 20s 146 3d 1h 5m 49s 5h 7m 33s
17 152 "L. MEZGEC" "TEAM JAYCO ALU… 167 5h 31m 28s 42m 20s 124 3d 35m 58s 4h 37m 42s
17 153 "F. FRISON" "LOTTO DSTNY" 185 5h 31m 40s 42m 32s 152 3d 1h 18m 35s 5h 20m 19s
17 154 "S. GESCHKE" "COFIDIS" 123 5h 34m 4s 44m 56s 67 2d 23h 15m 45s 3h 17m 29s
Code
altair_theme()
base = (
    alt.Chart(
        df.select(~cs.by_dtype(pl.Duration))
        .filter(pl.col("Overall Rank") <= 10)
        .pivot(
            index=["Rider", "Team", "Rider No."],
            columns="Stage",
            values="Overall Rank",
            aggregate_function=None,
        )
        .melt(
            id_vars=["Rider", "Team", "Rider No."],
            value_vars=[str(i) for i in range(1, 17)],
            value_name="Overall Rank",
            variable_name="Stage",
        )
        .with_columns(pl.col("Stage").cast(int))
        .sort("Stage", "Overall Rank")
        .select("Stage", "Rider", "Team", "Rider No.", "Overall Rank")
        .fill_null(np.nan)
    )
    .encode(
        x=alt.X("Stage:O").axis(labelAlign="center", titleAnchor="middle"),
        y=alt.Y("Overall Rank:O").axis(orient="left").title("Classement général"),
        color=alt.Color(
            "Team:N",
            legend=alt.Legend(
                orient="none",
                legendX=100,
                legendY=100,
                direction="horizontal",
                titleAnchor="middle",
                columns=2,
            ),
        ),
        detail="Rider:N",
    )
    .properties(width=2000, height=400)
)

alt.layer(
    base.mark_line(strokeWidth=3, clip=True),
    base.mark_point(size=100, filled=True, opacity=1, clip=True),
    # base.mark_text(dy=-10, align="right", clip=True)
    # .encode(text="Rider:N")
    # .transform_filter(alt.datum["Stage"] == 16),
)
Code
laststage = 17
base = alt.Chart(
    df.drop("Times", "Gap")
    .with_columns(
        pl.col("Overall Time").dt.seconds(), pl.col("Overall Gap").dt.seconds()
    )
    .filter(pl.col("Overall Rank").last().over("Rider") <= 10)
    .with_columns(
        pl.when(pl.col("Overall Gap") == 984)
        .then(pl.lit(1010))
        .otherwise(pl.col("Overall Gap"))
        .alias("labelY")
    )
    .with_columns(
        pl.when(pl.col("labelY") == 708)
        .then(pl.lit(690))
        .otherwise(pl.col("labelY"))
        .alias("labelY")
    )
    .with_columns(
        pl.when(pl.col("labelY") == 759)
        .then(pl.lit(780))
        .otherwise(pl.col("labelY"))
        .alias("labelY")
    )
    .with_columns(pl.col("labelY") / 60)
    .with_columns(
        pl.struct(pl.all())
        .apply(
            lambda s: f"""{s["Rider"]} {"" if s["Overall Gap"]==0 else f' (+{s["Overall Gap"]//60}min {s["Overall Gap"]%60}s)'}"""
        )
        .alias("labelText")
    )
    .with_columns(
        (pl.col("Overall Gap") // 60)
        # .apply(lambda s: str(s) + "min")
        .alias("Overall Gap Min")
    )
    .with_columns((pl.col("Overall Rank")).last().over("Rider").alias("Final Rank"))
    .to_pandas()
).encode(
    x=alt.X("Stage:N").axis(
        titleAnchor="end",
        titleAlign="left",
        titleBaseline="bottom",
        labelBaseline="top",
        # titleX=-30,
        # titleFontSize=20,
    ),
    y=alt.Y("Overall Gap Min:Q")
    .scale(domain=(-0.5, 20), reverse=True)
    .axis(values=list(range(20)), format=".0d")
    .title(
        "Gap to yellow jersey",
    ),
    # .title(""),
    # .axis(None),
    # .scale(reverse=False),
    detail=alt.Detail("Rider"),
    order=alt.Order("Final Rank"),
    color=alt.Color("Team:N").legend(None),
)


(
    base.mark_line(clip=True, strokeWidth=3)
    + base.mark_point(clip=True, size=100, filled=True, opacity=1)
    + base.mark_line(clip=True, strokeWidth=3).transform_filter(
        alt.datum["Rider"] == "J. VINGEGAARD"
    )
    + base.mark_point(clip=True, size=100, filled=True, opacity=1).transform_filter(
        alt.datum["Rider"] == "J. VINGEGAARD"
    )
    + base.mark_text(clip=False, align="left", dx=20, fontSize=14)
    .encode(text="labelText:N", y="labelY")
    .transform_filter(alt.datum["Stage"] == laststage)
).properties(
    title={
        "text": "Classement Général Tour de France 2023",
        "dy": 00,
        "anchor": "middle",
        "subtitle": "Graphic by Thomas Camminady",
        "subtitleColor": "gray",
    },
    height=600,
)
Code
['J. VINGEGAARD',
 'T. POGAČAR',
 'A. YATES',
 'C. RODRIGUEZ CANO',
 'S. YATES',
 'P. BILBAO LOPEZ',
 'J. HINDLEY',
 'F. GALL',
 'S. KUSS',
 'D. GAUDU']