Code
force = True
if os.path.exists(filename := "data.parquet") and not force:
df = pl.read_parquet(filename)
else:
df = pl.DataFrame(
pd.concat(
[
pd.read_html(f"https://www.letour.fr/en/rankings/stage-{i}")[0].assign(
stage=i
)
for i in range(1, 18)
]
)
)
df.write_parquet(filename)
df = (
df.with_columns(
pl.col("Times")
.str.strptime(pl.Time, format="""%Hh %M' %S''""")
.cast(pl.Duration)
)
.with_columns(
pl.col("Gap")
.apply(lambda s: str(s).replace("+ ", "").replace("-", "0h 0' 0''"))
.str.strptime(pl.Time, format="""%Hh %M' %S''""", strict=True)
.cast(pl.Duration)
)
.rename(mapping={"stage": "Stage"})
.select("Stage", "Rank", "Rider", "Team", "Rider No.", "Times", "Gap")
.with_columns(pl.col("Times").rank().over("Stage").alias("Overall Rank"))
.with_columns(
pl.col("Rider").cast(pl.Categorical),
pl.col("Team").cast(pl.Categorical),
)
.sort("Stage", "Times")
.with_columns(pl.col("Times").cumsum().over("Rider").alias("Overall Time"))
.with_columns(
pl.col("Overall Time").rank().over("Stage").alias("Overall Rank").cast(int)
)
.with_columns(
(pl.col("Overall Time") - pl.col("Overall Time").min())
.over("Stage")
.alias("Overall Gap")
)
)
df
shape: (2_847, 10)
Stage | Rank | Rider | Team | Rider No. | Times | Gap | Overall Rank | Overall Time | Overall Gap |
---|---|---|---|---|---|---|---|---|---|
i64 | i64 | cat | cat | i64 | duration[μs] | duration[μs] | i64 | duration[μs] | duration[μs] |
1 | 1 | "A. YATES" | "UAE TEAM EMIRA… | 19 | 4h 22m 49s | 0µs | 1 | 4h 22m 49s | 0µs |
1 | 2 | "S. YATES" | "TEAM JAYCO ALU… | 161 | 4h 22m 53s | 4s | 2 | 4h 22m 53s | 4s |
1 | 3 | "T. POGAČAR" | "UAE TEAM EMIRA… | 11 | 4h 23m 1s | 12s | 8 | 4h 23m 1s | 12s |
1 | 4 | "T. PINOT" | "GROUPAMA - FDJ… | 37 | 4h 23m 1s | 12s | 8 | 4h 23m 1s | 12s |
1 | 5 | "M. WOODS" | "ISRAEL - PREMI… | 151 | 4h 23m 1s | 12s | 8 | 4h 23m 1s | 12s |
1 | 6 | "V. LAFAY" | "COFIDIS" | 125 | 4h 23m 1s | 12s | 8 | 4h 23m 1s | 12s |
1 | 7 | "J. HINDLEY" | "BORA - HANSGRO… | 71 | 4h 23m 1s | 12s | 8 | 4h 23m 1s | 12s |
1 | 8 | "S. JENSEN" | "LIDL - TREK" | 83 | 4h 23m 1s | 12s | 8 | 4h 23m 1s | 12s |
1 | 9 | "J. VINGEGAARD" | "JUMBO-VISMA" | 1 | 4h 23m 1s | 12s | 8 | 4h 23m 1s | 12s |
1 | 10 | "D. GAUDU" | "GROUPAMA - FDJ… | 31 | 4h 23m 1s | 12s | 8 | 4h 23m 1s | 12s |
1 | 11 | "W. VAN AERT" | "JUMBO-VISMA" | 6 | 4h 23m 1s | 12s | 8 | 4h 23m 1s | 12s |
1 | 12 | "M. LANDA" | "BAHRAIN VICTOR… | 62 | 4h 23m 1s | 12s | 8 | 4h 23m 1s | 12s |
… | … | … | … | … | … | … | … | … | … |
17 | 143 | "Y. FEDOROV" | "ASTANA QAZAQST… | 194 | 5h 30m 47s | 41m 39s | 151 | 3d 1h 18m 30s | 5h 20m 14s |
17 | 144 | "C. BOL" | "ASTANA QAZAQST… | 192 | 5h 30m 47s | 41m 39s | 153 | 3d 1h 21m 4s | 5h 22m 48s |
17 | 145 | "M. MØRKØV" | "SOUDAL QUICK-S… | 58 | 5h 30m 57s | 41m 49s | 154 | 3d 1h 30m 31s | 5h 32m 15s |
17 | 146 | "M. PEDERSEN" | "LIDL - TREK" | 86 | 5h 30m 58s | 41m 50s | 114 | 3d 23m 26s | 4h 25m 10s |
17 | 147 | "A. KIRSCH" | "LIDL - TREK" | 84 | 5h 30m 58s | 41m 50s | 115 | 3d 23m 52s | 4h 25m 36s |
17 | 148 | "A. PETIT" | "INTERMARCHÉ - … | 115 | 5h 31m 10s | 42m 2s | 144 | 3d 1h 3m 44s | 5h 5m 28s |
17 | 149 | "S. WÆRENSKJOLD… | "UNO-X PRO CYCL… | 207 | 5h 31m 22s | 42m 14s | 143 | 3d 1h 1m 37s | 5h 3m 21s |
17 | 150 | "E. REINDERS" | "TEAM JAYCO ALU… | 168 | 5h 31m 28s | 42m 20s | 142 | 3d 1h 49s | 5h 2m 33s |
17 | 151 | "D. GROENEWEGEN… | "TEAM JAYCO ALU… | 164 | 5h 31m 28s | 42m 20s | 146 | 3d 1h 5m 49s | 5h 7m 33s |
17 | 152 | "L. MEZGEC" | "TEAM JAYCO ALU… | 167 | 5h 31m 28s | 42m 20s | 124 | 3d 35m 58s | 4h 37m 42s |
17 | 153 | "F. FRISON" | "LOTTO DSTNY" | 185 | 5h 31m 40s | 42m 32s | 152 | 3d 1h 18m 35s | 5h 20m 19s |
17 | 154 | "S. GESCHKE" | "COFIDIS" | 123 | 5h 34m 4s | 44m 56s | 67 | 2d 23h 15m 45s | 3h 17m 29s |
Code
altair_theme()
base = (
alt.Chart(
df.select(~cs.by_dtype(pl.Duration))
.filter(pl.col("Overall Rank") <= 10)
.pivot(
index=["Rider", "Team", "Rider No."],
columns="Stage",
values="Overall Rank",
aggregate_function=None,
)
.melt(
id_vars=["Rider", "Team", "Rider No."],
value_vars=[str(i) for i in range(1, 17)],
value_name="Overall Rank",
variable_name="Stage",
)
.with_columns(pl.col("Stage").cast(int))
.sort("Stage", "Overall Rank")
.select("Stage", "Rider", "Team", "Rider No.", "Overall Rank")
.fill_null(np.nan)
)
.encode(
x=alt.X("Stage:O").axis(labelAlign="center", titleAnchor="middle"),
y=alt.Y("Overall Rank:O").axis(orient="left").title("Classement général"),
color=alt.Color(
"Team:N",
legend=alt.Legend(
orient="none",
legendX=100,
legendY=100,
direction="horizontal",
titleAnchor="middle",
columns=2,
),
),
detail="Rider:N",
)
.properties(width=2000, height=400)
)
alt.layer(
base.mark_line(strokeWidth=3, clip=True),
base.mark_point(size=100, filled=True, opacity=1, clip=True),
# base.mark_text(dy=-10, align="right", clip=True)
# .encode(text="Rider:N")
# .transform_filter(alt.datum["Stage"] == 16),
)
Code
laststage = 17
base = alt.Chart(
df.drop("Times", "Gap")
.with_columns(
pl.col("Overall Time").dt.seconds(), pl.col("Overall Gap").dt.seconds()
)
.filter(pl.col("Overall Rank").last().over("Rider") <= 10)
.with_columns(
pl.when(pl.col("Overall Gap") == 984)
.then(pl.lit(1010))
.otherwise(pl.col("Overall Gap"))
.alias("labelY")
)
.with_columns(
pl.when(pl.col("labelY") == 708)
.then(pl.lit(690))
.otherwise(pl.col("labelY"))
.alias("labelY")
)
.with_columns(
pl.when(pl.col("labelY") == 759)
.then(pl.lit(780))
.otherwise(pl.col("labelY"))
.alias("labelY")
)
.with_columns(pl.col("labelY") / 60)
.with_columns(
pl.struct(pl.all())
.apply(
lambda s: f"""{s["Rider"]} {"" if s["Overall Gap"]==0 else f' (+{s["Overall Gap"]//60}min {s["Overall Gap"]%60}s)'}"""
)
.alias("labelText")
)
.with_columns(
(pl.col("Overall Gap") // 60)
# .apply(lambda s: str(s) + "min")
.alias("Overall Gap Min")
)
.with_columns((pl.col("Overall Rank")).last().over("Rider").alias("Final Rank"))
.to_pandas()
).encode(
x=alt.X("Stage:N").axis(
titleAnchor="end",
titleAlign="left",
titleBaseline="bottom",
labelBaseline="top",
# titleX=-30,
# titleFontSize=20,
),
y=alt.Y("Overall Gap Min:Q")
.scale(domain=(-0.5, 20), reverse=True)
.axis(values=list(range(20)), format=".0d")
.title(
"Gap to yellow jersey",
),
# .title(""),
# .axis(None),
# .scale(reverse=False),
detail=alt.Detail("Rider"),
order=alt.Order("Final Rank"),
color=alt.Color("Team:N").legend(None),
)
(
base.mark_line(clip=True, strokeWidth=3)
+ base.mark_point(clip=True, size=100, filled=True, opacity=1)
+ base.mark_line(clip=True, strokeWidth=3).transform_filter(
alt.datum["Rider"] == "J. VINGEGAARD"
)
+ base.mark_point(clip=True, size=100, filled=True, opacity=1).transform_filter(
alt.datum["Rider"] == "J. VINGEGAARD"
)
+ base.mark_text(clip=False, align="left", dx=20, fontSize=14)
.encode(text="labelText:N", y="labelY")
.transform_filter(alt.datum["Stage"] == laststage)
).properties(
title={
"text": "Classement Général Tour de France 2023",
"dy": 00,
"anchor": "middle",
"subtitle": "Graphic by Thomas Camminady",
"subtitleColor": "gray",
},
height=600,
)
Code
['J. VINGEGAARD',
'T. POGAČAR',
'A. YATES',
'C. RODRIGUEZ CANO',
'S. YATES',
'P. BILBAO LOPEZ',
'J. HINDLEY',
'F. GALL',
'S. KUSS',
'D. GAUDU']