import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Plot styling: seaborn's "whitegrid" theme, figures rendered at 120 dpi.
sns.set_theme(style="whitegrid")
plt.rcParams.update({"figure.dpi": 120})

# Location of the ADVS.csv file read below.
URL = (
    "https://raw.githubusercontent.com/VIS-SIG/Wonderful-Wednesdays/"
    "master/data/2026/2026-05-13/ADVS.csv"
)
df = pd.read_csv(filepath_or_buffer=URL)

# Notebook preview of the first rows.
df.head()
| STUDYID | USUBJID | SUBJID | SITEID | COUNTRY | AGE | SEX | RACE | TRT01P | TRT01PN | ... | AVISITN | PARAMCD | PARAM | AVAL | BASE | CHG | PCHG | ABLFL | ANL01FL | DTYPE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BP-RCT-001 | SITE01-001 | 1 | SITE01 | GBR | 70 | M | WHITE | ACTIVE | 1 | ... | 1 | DIABP | Diastolic Blood Pressure (mmHg) | 109 | 109 | NaN | NaN | Y | Y | NaN |
| 1 | BP-RCT-001 | SITE01-001 | 1 | SITE01 | GBR | 70 | M | WHITE | ACTIVE | 1 | ... | 2 | DIABP | Diastolic Blood Pressure (mmHg) | 107 | 109 | -2.0 | -1.834862 | NaN | Y | NaN |
| 2 | BP-RCT-001 | SITE01-001 | 1 | SITE01 | GBR | 70 | M | WHITE | ACTIVE | 1 | ... | 3 | DIABP | Diastolic Blood Pressure (mmHg) | 108 | 109 | -1.0 | -0.917431 | NaN | Y | NaN |
| 3 | BP-RCT-001 | SITE01-001 | 1 | SITE01 | GBR | 70 | M | WHITE | ACTIVE | 1 | ... | 4 | DIABP | Diastolic Blood Pressure (mmHg) | 104 | 109 | -5.0 | -4.587156 | NaN | Y | NaN |
| 4 | BP-RCT-001 | SITE01-001 | 1 | SITE01 | GBR | 70 | M | WHITE | ACTIVE | 1 | ... | 5 | DIABP | Diastolic Blood Pressure (mmHg) | 102 | 109 | -7.0 | -6.422018 | NaN | Y | NaN |
5 rows × 30 columns
# Clean data: force the measurement columns to numeric; entries that cannot be
# parsed become NaN (errors="coerce").
for numeric_col in ("AVAL", "CHG", "AVISITN"):
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors="coerce")

# Parse the ADT column day-first; unparseable entries become NaT.
df["ADT"] = pd.to_datetime(df["ADT"], dayfirst=True, errors="coerce")

# Basic check: frame dimensions, row count per site, and a peek at the data.
for summary in (df.shape, df["SITEID"].value_counts(), df.head()):
    print(summary)
(2320, 30)
SITEID
SITE03 420
SITE06 400
SITE04 394
SITE02 382
SITE05 374
SITE01 350
Name: count, dtype: int64
STUDYID USUBJID SUBJID SITEID COUNTRY AGE SEX RACE TRT01P \
0 BP-RCT-001 SITE01-001 1 SITE01 GBR 70 M WHITE ACTIVE
1 BP-RCT-001 SITE01-001 1 SITE01 GBR 70 M WHITE ACTIVE
2 BP-RCT-001 SITE01-001 1 SITE01 GBR 70 M WHITE ACTIVE
3 BP-RCT-001 SITE01-001 1 SITE01 GBR 70 M WHITE ACTIVE
4 BP-RCT-001 SITE01-001 1 SITE01 GBR 70 M WHITE ACTIVE
TRT01PN ... AVISITN PARAMCD PARAM AVAL BASE \
0 1 ... 1 DIABP Diastolic Blood Pressure (mmHg) 109 109
1 1 ... 2 DIABP Diastolic Blood Pressure (mmHg) 107 109
2 1 ... 3 DIABP Diastolic Blood Pressure (mmHg) 108 109
3 1 ... 4 DIABP Diastolic Blood Pressure (mmHg) 104 109
4 1 ... 5 DIABP Diastolic Blood Pressure (mmHg) 102 109
CHG PCHG ABLFL ANL01FL DTYPE
0 NaN NaN Y Y NaN
1 -2.0 -1.834862 NaN Y NaN
2 -1.0 -0.917431 NaN Y NaN
3 -5.0 -4.587156 NaN Y NaN
4 -7.0 -6.422018 NaN Y NaN
[5 rows x 30 columns]
1. Digit preference: round numbers¶
# Last digit of every recorded value. The integer cast drops any fractional
# part; abs() keeps the digit non-negative.
# NOTE(review): astype(int) raises if AVAL contains NaN — confirm the cleaned
# column has no missing values.
whole_values = df["AVAL"].astype(int)
df["last_digit"] = whole_values.abs() % 10
df["round_0_5"] = df["last_digit"].isin([0, 5])

# Per-site share of values whose last digit is 0 or 5.
round_share_by_site = df.groupby("SITEID")["round_0_5"].mean()
digit_pref = round_share_by_site.reset_index(name="proportion_round_0_or_5")
print(digit_pref)

# One bar per site on a fixed 0-1 axis so sites are directly comparable.
fig, ax = plt.subplots(figsize=(7, 4))
ax.bar(digit_pref["SITEID"], digit_pref["proportion_round_0_or_5"])
ax.set_title("Digit Preference by Site")
ax.set_xlabel("Site")
ax.set_ylabel("Proportion of BP values ending in 0 or 5")
ax.set_ylim(0, 1)
ax.grid(axis="y", alpha=0.3)
fig.tight_layout()
plt.show()
SITEID proportion_round_0_or_5 0 SITE01 0.242857 1 SITE02 0.222513 2 SITE03 0.821429 3 SITE04 0.187817 4 SITE05 0.181818 5 SITE06 0.185000
If one site has many more values ending in 0 or 5 than the others, that suggests its measurements may have been rounded or fabricated.
2. Last digit distribution by site¶
# Row-normalised cross-tabulation: each row is one site, each column one last
# digit, and every row sums to 1.
digit_table = pd.crosstab(
    index=df["SITEID"],
    columns=df["last_digit"],
    normalize="index",
)
print(digit_table)

# Render the table as an image: one cell per (site, digit) pair.
fig, ax = plt.subplots(figsize=(9, 5))
image = ax.imshow(digit_table, aspect="auto")
fig.colorbar(image, ax=ax, label="Proportion")

# Label the columns 0-9 and the rows with the site identifiers.
digits = range(10)
ax.set_xticks(digits)
ax.set_xticklabels(digits)
site_labels = digit_table.index
ax.set_yticks(range(len(site_labels)))
ax.set_yticklabels(site_labels)

ax.set_title("Distribution of Final Digits by Site")
ax.set_xlabel("Final Digit of BP Measurement")
ax.set_ylabel("Site")
fig.tight_layout()
plt.show()
last_digit 0 1 2 3 4 5 \ SITEID SITE01 0.111429 0.091429 0.117143 0.077143 0.088571 0.131429 SITE02 0.107330 0.070681 0.115183 0.091623 0.117801 0.115183 SITE03 0.295238 0.023810 0.023810 0.033333 0.033333 0.526190 SITE04 0.098985 0.109137 0.106599 0.076142 0.081218 0.088832 SITE05 0.077540 0.101604 0.088235 0.098930 0.122995 0.104278 SITE06 0.087500 0.090000 0.087500 0.085000 0.117500 0.097500 last_digit 6 7 8 9 SITEID SITE01 0.091429 0.102857 0.100000 0.088571 SITE02 0.091623 0.086387 0.102094 0.102094 SITE03 0.026190 0.011905 0.011905 0.014286 SITE04 0.101523 0.126904 0.126904 0.083756 SITE05 0.101604 0.106952 0.122995 0.074866 SITE06 0.102500 0.120000 0.115000 0.097500
# Same site-by-digit proportions as a cross-tabulation, rounded to three
# decimals before being drawn as an annotated heatmap.
row_shares = pd.crosstab(df["SITEID"], df["last_digit"], normalize="index")
digit_table = row_shares.round(3)

fig, ax = plt.subplots(figsize=(10, 4))
heatmap_options = dict(
    annot=True,
    fmt=".2f",  # cell annotations are shown with two decimals
    cmap="YlOrRd",
    linewidths=0.5,
    cbar_kws={"label": "Proportion"},
)
sns.heatmap(digit_table, ax=ax, **heatmap_options)
ax.set_title("Last digit distribution by site — SITE03 shows extreme preference for 0 and 5")
ax.set_xlabel("Last digit of BP value")
fig.tight_layout()
plt.show()
Story: real measurements should show a fairly even mix of last digits (roughly 10% each). SITE03 stands out: about 30% of its values end in 0 and about 53% end in 5.
3. Variability: fake data is often too consistent¶
# Keep only rows with visit number above 1; the cell name treats these as the
# post-baseline records.
after_first_visit = df["AVISITN"].gt(1)
post_base = df.loc[after_first_visit]

# Standard deviation of CHG within every (site, parameter) pair.
chg_sd = post_base.groupby(["SITEID", "PARAMCD"])["CHG"].std()
variability = chg_sd.reset_index(name="sd_change")
print(variability)

# One bar chart per blood-pressure parameter.
for param_code in ("SYSBP", "DIABP"):
    param_sd = variability[variability["PARAMCD"].eq(param_code)]
    fig, ax = plt.subplots(figsize=(7, 4))
    ax.bar(param_sd["SITEID"], param_sd["sd_change"])
    ax.set_title(f"Variability in Change from Baseline: {param_code}")
    ax.set_xlabel("Site")
    ax.set_ylabel("Standard deviation of change")
    ax.grid(axis="y", alpha=0.3)
    fig.tight_layout()
    plt.show()
SITEID PARAMCD sd_change 0 SITE01 DIABP 4.962773 1 SITE01 SYSBP 7.369588 2 SITE02 DIABP 5.630034 3 SITE02 SYSBP 9.200822 4 SITE03 DIABP 3.327271 5 SITE03 SYSBP 5.223485 6 SITE04 DIABP 5.778665 7 SITE04 SYSBP 8.947973 8 SITE05 DIABP 6.394070 9 SITE05 SYSBP 7.268345 10 SITE06 DIABP 5.131955 11 SITE06 SYSBP 7.276693
Story: fabricated data often shows lower variability than real patient data. SITE03 has the lowest SD of change for both DIABP and SYSBP.
4. Missing data / dropout¶
# Collapse the data to one row per (site, subject, visit number, date), ordered
# by site, subject and visit number.
visit_key = ["SITEID", "USUBJID", "AVISITN", "ADT"]
unique_visits = df.drop_duplicates(subset=visit_key)
visits = unique_visits.sort_values(by=visit_key[:3])

# Number of distinct visit numbers recorded for each subject.
distinct_visits = visits.groupby(["SITEID", "USUBJID"])["AVISITN"].nunique()
visits_per_subject = distinct_visits.reset_index(name="n_visits")


def _share_with_all_visits(visit_counts):
    """Return the fraction of subjects whose distinct-visit count equals 7."""
    return (visit_counts == 7).mean()


# Per site: average visit count and share of subjects with all 7 visits.
completion = (
    visits_per_subject.groupby("SITEID")
    .agg(
        mean_visits=("n_visits", "mean"),
        full_completion=("n_visits", _share_with_all_visits),
    )
    .reset_index()
)
print(completion)

fig, ax = plt.subplots(figsize=(7, 4))
ax.bar(completion["SITEID"], completion["full_completion"])
ax.set_title("Proportion of Subjects With Complete Visits")
ax.set_xlabel("Site")
ax.set_ylabel("Proportion with all 7 visits")
ax.set_ylim(0, 1.05)  # slight headroom above a full-height bar
ax.grid(axis="y", alpha=0.3)
fig.tight_layout()
plt.show()
SITEID mean_visits full_completion 0 SITE01 5.833333 0.666667 1 SITE02 6.366667 0.766667 2 SITE03 7.000000 1.000000 3 SITE04 6.566667 0.700000 4 SITE05 6.233333 0.700000 5 SITE06 6.666667 0.833333
SITE03 having every subject complete all visits would be suspicious because real trials usually have missed visits/dropouts.
5. Visit regularity¶
# Days elapsed since the same subject's previous row in `visits`; the first
# row of every subject has no predecessor and gets NaN.
visit_dates_by_subject = visits.groupby(["SITEID", "USUBJID"])["ADT"]
visits["days_between"] = visit_dates_by_subject.diff().dt.days

# Only rows with a defined gap feed the per-site summary.
intervals = visits[visits["days_between"].notna()]
summary_stats = ["mean", "std", "min", "max"]
interval_summary = (
    intervals.groupby("SITEID")["days_between"].agg(summary_stats).reset_index()
)
print(interval_summary)

fig, ax = plt.subplots(figsize=(7, 4))
ax.bar(interval_summary["SITEID"], interval_summary["std"])
ax.set_title("Visit Timing Variability by Site")
ax.set_xlabel("Site")
ax.set_ylabel("Standard deviation of days between visits")
ax.grid(axis="y", alpha=0.3)
fig.tight_layout()
plt.show()
SITEID mean std min max 0 SITE01 28.124138 2.938922 23.0 34.0 1 SITE02 27.987578 2.537192 22.0 34.0 2 SITE03 28.000000 0.000000 28.0 28.0 3 SITE04 27.976048 2.744001 22.0 34.0 4 SITE05 27.834395 3.094344 22.0 34.0 5 SITE06 27.894118 2.782122 22.0 34.0
Story: in real life, visit intervals vary. If a site has almost zero variation, that is suspicious.
# Per-subject value-over-visit lines for the site under suspicion, one figure
# per blood-pressure parameter.
suspect_site = "SITE03"
suspect_rows = df[df["SITEID"] == suspect_site]

for param_code in ("SYSBP", "DIABP"):
    param_rows = suspect_rows[suspect_rows["PARAMCD"] == param_code]
    fig, ax = plt.subplots(figsize=(8, 5))
    # One semi-transparent line per subject, drawn in visit order.
    for _, subject_rows in param_rows.groupby("USUBJID"):
        ordered = subject_rows.sort_values("AVISITN")
        ax.plot(ordered["AVISITN"], ordered["AVAL"], alpha=0.4)
    ax.set_title(f"Patient BP Trajectories at {suspect_site}: {param_code}")
    ax.set_xlabel("Visit")
    ax.set_ylabel("Blood Pressure")
    ax.grid(alpha=0.3)
    fig.tight_layout()
    plt.show()
# Small-multiples view of diastolic BP: one panel per site, one line per
# subject, with SITE03 drawn in red and every other site in green.

# Build the site list from the data itself. Nothing earlier in the notebook
# assigns `sites`, so relying on it here fails with NameError on a clean
# top-to-bottom run.
sites = sorted(df["SITEID"].dropna().unique())
highlight_site = "SITE03"

fig, axes = plt.subplots(2, 3, figsize=(16, 10), sharey=False)
fig.suptitle("Patient BP Trajectories by Site — DIABP", fontsize=14, fontweight="bold")

# zip() pairs panels with sites and stops at the shorter of the two, so the
# 2 x 3 grid shows at most six sites.
for ax, site in zip(axes.flatten(), sites):
    is_highlighted = site == highlight_site
    # Colour depends only on the site, so pick it once per panel.
    line_color = "#E24B4A" if is_highlighted else "#1D9E75"

    site_data = df[(df["SITEID"] == site) & (df["PARAMCD"] == "DIABP")]
    for _, group in site_data.groupby("USUBJID"):
        group = group.sort_values("AVISITN")
        ax.plot(group["AVISITN"], group["AVAL"], alpha=0.4, color=line_color, linewidth=1)

    ax.set_title(site, fontweight="bold",
                 color="#E24B4A" if is_highlighted else "black")
    ax.set_xlabel("Visit")
    ax.set_ylabel("Diastolic BP (mmHg)")
    ax.grid(alpha=0.3)
    ax.spines[["top", "right"]].set_visible(False)

plt.tight_layout()
plt.show()
# Small-multiples view of systolic BP: one panel per site, one line per
# subject, with SITE03 drawn in red and every other site in green.

# Build the site list from the data itself. Nothing earlier in the notebook
# assigns `sites`, so relying on it here fails with NameError on a clean
# top-to-bottom run.
sites = sorted(df["SITEID"].dropna().unique())
highlight_site = "SITE03"

fig, axes = plt.subplots(2, 3, figsize=(16, 10), sharey=False)
fig.suptitle("Patient BP Trajectories by Site — SYSBP", fontsize=14, fontweight="bold")

# zip() pairs panels with sites and stops at the shorter of the two, so the
# 2 x 3 grid shows at most six sites.
for ax, site in zip(axes.flatten(), sites):
    is_highlighted = site == highlight_site
    # Colour depends only on the site, so pick it once per panel.
    line_color = "#E24B4A" if is_highlighted else "#1D9E75"

    site_data = df[(df["SITEID"] == site) & (df["PARAMCD"] == "SYSBP")]
    for _, group in site_data.groupby("USUBJID"):
        group = group.sort_values("AVISITN")
        ax.plot(group["AVISITN"], group["AVAL"], alpha=0.4, color=line_color, linewidth=1)

    ax.set_title(site, fontweight="bold",
                 color="#E24B4A" if is_highlighted else "black")
    ax.set_xlabel("Visit")
    ax.set_ylabel("Systolic BP (mmHg)")
    ax.grid(alpha=0.3)
    ax.spines[["top", "right"]].set_visible(False)

plt.tight_layout()
plt.show()
# Gather the four site-level indicators into one table keyed by SITEID.

# 1) Digit preference: share of values ending in 0 or 5.
round_share = df.groupby("SITEID")["round_0_5"].mean()
digit_pref = round_share.rename("round_digit_rate").reset_index()

# 2) Variability: SD of CHG over all post_base rows of the site.
chg_spread = post_base.groupby("SITEID")["CHG"].std()
site_var = chg_spread.rename("change_sd").reset_index()

# 3) Completion: keep only the share of subjects with every visit.
completion = completion.loc[:, ["SITEID", "full_completion"]]

# 4) Visit timing: SD of the gap between consecutive visits.
interval_sd = interval_summary.loc[:, ["SITEID", "std"]]
interval_sd = interval_sd.rename(columns={"std": "visit_interval_sd"})

# Join everything on SITEID, one indicator at a time (default inner join).
fraud_indicators = digit_pref
for indicator in (site_var, completion, interval_sd):
    fraud_indicators = fraud_indicators.merge(indicator, on="SITEID")
print(fraud_indicators)
SITEID round_digit_rate change_sd full_completion visit_interval_sd 0 SITE01 0.242857 6.277419 0.666667 2.938922 1 SITE02 0.222513 7.831257 0.766667 2.537192 2 SITE03 0.821429 4.621213 1.000000 0.000000 3 SITE04 0.187817 7.634920 0.700000 2.744001 4 SITE05 0.181818 6.835951 0.700000 3.094344 5 SITE06 0.185000 6.339367 0.833333 2.782122
# Inputs for the dashboard: the site order plus one SITEID-indexed Series per
# indicator. The two rates are rescaled to percentages.
by_site = fraud_indicators.set_index("SITEID")
sites = by_site.index.tolist()
round_vals = by_site["round_digit_rate"].mul(100)
sd_vals = by_site["change_sd"]
comp_vals = by_site["full_completion"].mul(100)
int_vals = by_site["visit_interval_sd"]

# The flagged site is drawn in red, every other site in green.
fraud_site = "SITE03"
bar_color_for = {True: "#E24B4A", False: "#1D9E75"}
colors = [bar_color_for[s == fraud_site] for s in sites]
import matplotlib.gridspec as gridspec


def _draw_indicator_panel(ax, values, ylabel, title, y_max, label_offset, label_fmt):
    """Draw one indicator as a bar chart with a value label above each bar.

    Reads the notebook-level ``sites``, ``colors`` and ``fraud_site``.
    """
    ax.bar(sites, values[sites], color=colors, edgecolor="white")
    ax.set_ylabel(ylabel)
    ax.set_title(title, fontweight="bold")
    ax.set_ylim(0, y_max)
    for position, (site, value) in enumerate(zip(sites, values[sites])):
        ax.text(position, value + label_offset, label_fmt.format(value),
                ha="center", fontsize=9,
                color="#E24B4A" if site == fraud_site else "#444")
    ax.spines[["top", "right"]].set_visible(False)


fig = plt.figure(figsize=(14, 10), facecolor="white")
fig.suptitle("Clinical Trial Fraud Detection — Site-Level Analysis",
             fontsize=15, fontweight="bold", y=0.98)
gs = gridspec.GridSpec(2, 2, figure=fig, hspace=0.45, wspace=0.35)

# (grid cell, values, y-label, title, y-axis maximum, label offset, label format)
panel_specs = [
    # Panel 1: Digit preference
    (gs[0, 0], round_vals, "% values ending in 0 or 5",
     "Digit preference", 100, 1, "{:.0f}%"),
    # Panel 2: CHG variability
    (gs[0, 1], sd_vals, "SD of CHG (mmHg)",
     "CHG variability", 12, 0.15, "{:.1f}"),
    # Panel 3: Completion rate
    (gs[1, 0], comp_vals, "% subjects completing all visits",
     "Visit completion rate", 115, 1, "{:.0f}%"),
    # Panel 4: Visit interval SD
    (gs[1, 1], int_vals, "SD of days between visits",
     "Visit scheduling variability", 4.5, 0.05, "{:.2f}"),
]

panel_axes = []
for cell, values, ylabel, title, y_max, label_offset, label_fmt in panel_specs:
    panel_ax = fig.add_subplot(cell)
    _draw_indicator_panel(panel_ax, values, ylabel, title, y_max, label_offset, label_fmt)
    panel_axes.append(panel_ax)
ax1, ax2, ax3, ax4 = panel_axes

# Verdict banner along the bottom edge of the figure.
banner_text = (
    "⚠ SITE03 flagged: 82% digit preference | 100% completion | "
    "0-day visit interval SD | Lowest CHG variability — recommend audit"
)
banner_box = dict(boxstyle="round,pad=0.4", facecolor="#FCEBEB",
                  edgecolor="#E24B4A", linewidth=1.5)
fig.text(0.5, 0.01, banner_text,
         ha="center", fontsize=10, color="#A32D2D", bbox=banner_box)

plt.savefig("fraud_detection.png", dpi=150, bbox_inches="tight")
plt.show()
Summary dashboard¶
Four independent statistical indicators are examined across all six sites. SITE03 (red) is a clear outlier on every measure simultaneously.
Overall suspicion score¶
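Each indicator is standardised to a z-score across the six sites (mean = 0, SD = 1) so they can be combined on the same scale. For the two indicators where a lower value is more suspicious (CHG variability and visit-interval SD) the sign is flipped, so for every score positive = more suspicious than average and negative = less suspicious. The four z-scores are summed into one composite score. No legitimate site fails all four checks — SITE03 does.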
score = fraud_indicators.copy()


def _zscore(column, *, higher_is_suspicious):
    """Standardise ``column`` so that larger results mean "more suspicious".

    When a low raw value is the suspicious direction, the deviation is taken
    as mean minus value, which flips the sign of the z-score.
    """
    if higher_is_suspicious:
        deviation = column - column.mean()
    else:
        deviation = column.mean() - column
    return deviation / column.std()


# (score column, source indicator, is a HIGHER raw value more suspicious?)
indicator_scores = [
    ("round_score", "round_digit_rate", True),
    ("low_variability_score", "change_sd", False),
    ("completion_score", "full_completion", True),
    ("regular_visit_score", "visit_interval_sd", False),
]
for score_col, source_col, higher_is_suspicious in indicator_scores:
    score[score_col] = _zscore(
        score[source_col], higher_is_suspicious=higher_is_suspicious
    )

# Composite: add the four z-scores in the order listed above.
composite = score[indicator_scores[0][0]]
for score_col, _, _ in indicator_scores[1:]:
    composite = composite + score[score_col]
score["overall_suspicion_score"] = composite

# Most suspicious site first.
score = score.sort_values("overall_suspicion_score", ascending=False)
report_columns = [
    "SITEID",
    "round_digit_rate",
    "change_sd",
    "full_completion",
    "visit_interval_sd",
    "overall_suspicion_score",
]
print(score[report_columns])

# SITE03 in red, every other site in green.
bar_colors = (
    score["SITEID"]
    .map(lambda s: '#E24B4A' if s == 'SITE03' else '#1D9E75')
    .tolist()
)

fig, ax = plt.subplots(figsize=(7, 4))
ax.bar(score["SITEID"], score["overall_suspicion_score"], color=bar_colors)
ax.set_title("Overall suspicion score by site")
ax.set_xlabel("Site")
ax.set_ylabel("Suspicion score (z-score composite)")
# Dashed zero line separates above-average from below-average suspicion.
ax.axhline(0, color='black', linewidth=0.8, linestyle='--')
ax.grid(axis="y", alpha=0.3)
fig.tight_layout()
plt.show()
SITEID round_digit_rate change_sd full_completion visit_interval_sd \ 2 SITE03 0.821429 4.621213 1.000000 0.000000 5 SITE06 0.185000 6.339367 0.833333 2.782122 0 SITE01 0.242857 6.277419 0.666667 2.938922 1 SITE02 0.222513 7.831257 0.766667 2.537192 4 SITE05 0.181818 6.835951 0.700000 3.094344 3 SITE04 0.187817 7.634920 0.700000 2.744001 overall_suspicion_score 2 7.533657 5 -0.188782 0 -1.384082 1 -1.653663 4 -1.971274 3 -2.335856
Conclusion¶
SITE03 is flagged as the suspected fraudulent site based on four independent indicators:
- 82% of BP values end in 0 or 5 (vs 18–24% at the other sites)
- 0% dropout — all 30 subjects completed all 7 visits (implausible in a real trial)
- Visit interval SD = 0 — every visit on the exact scheduled day (impossible at scale)
- Lowest CHG variability (SD = 4.6 vs 6.3–7.8 elsewhere)
No legitimate site fails all four checks simultaneously. Recommend audit of SITE03.
Create HTML¶
# Mount Google Drive at /content/drive; the notebook exported further down is
# read from a path under this mount point.
# NOTE(review): google.colab is presumably only importable inside a Colab
# runtime — this cell will not run elsewhere.
from google.colab import drive
drive.mount('/content/drive')
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
# Path of the notebook file on the mounted Drive.
path = "/content/drive/MyDrive/Colab Notebooks/Fraud in clinical trials.ipynb"
# IPython shell escape (not plain Python): convert the notebook to HTML with
# images embedded. {path} is filled in from the Python variable above and is
# quoted because the path contains spaces.
!jupyter nbconvert --to html --embed-images "{path}"
# The converted file is expected next to the notebook, with the extension
# swapped from .ipynb to .html.
html_path = path.replace(".ipynb", ".html")
# Hand the generated HTML file to Colab's download helper.
from google.colab import files
files.download(html_path)
[NbConvertApp] Converting notebook /content/drive/MyDrive/Colab Notebooks/Fraud in clinical trials.ipynb to html [NbConvertApp] WARNING | Alternative text is missing on 13 image(s). [NbConvertApp] Writing 2986897 bytes to /content/drive/MyDrive/Colab Notebooks/Fraud in clinical trials.html