In [70]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Global plot styling: seaborn "whitegrid" theme, default figure resolution of 120 DPI.
sns.set_theme(style="whitegrid")
plt.rcParams.update({"figure.dpi": 120})
In [71]:
# Location of the ADVS.csv file in the VIS-SIG Wonderful-Wednesdays repository.
URL = (
    "https://raw.githubusercontent.com/VIS-SIG/Wonderful-Wednesdays/"
    "master/data/2026/2026-05-13/ADVS.csv"
)

# Read the CSV directly from the URL and preview the first rows.
df = pd.read_csv(filepath_or_buffer=URL)
df.head()
Out[71]:
STUDYID USUBJID SUBJID SITEID COUNTRY AGE SEX RACE TRT01P TRT01PN ... AVISITN PARAMCD PARAM AVAL BASE CHG PCHG ABLFL ANL01FL DTYPE
0 BP-RCT-001 SITE01-001 1 SITE01 GBR 70 M WHITE ACTIVE 1 ... 1 DIABP Diastolic Blood Pressure (mmHg) 109 109 NaN NaN Y Y NaN
1 BP-RCT-001 SITE01-001 1 SITE01 GBR 70 M WHITE ACTIVE 1 ... 2 DIABP Diastolic Blood Pressure (mmHg) 107 109 -2.0 -1.834862 NaN Y NaN
2 BP-RCT-001 SITE01-001 1 SITE01 GBR 70 M WHITE ACTIVE 1 ... 3 DIABP Diastolic Blood Pressure (mmHg) 108 109 -1.0 -0.917431 NaN Y NaN
3 BP-RCT-001 SITE01-001 1 SITE01 GBR 70 M WHITE ACTIVE 1 ... 4 DIABP Diastolic Blood Pressure (mmHg) 104 109 -5.0 -4.587156 NaN Y NaN
4 BP-RCT-001 SITE01-001 1 SITE01 GBR 70 M WHITE ACTIVE 1 ... 5 DIABP Diastolic Blood Pressure (mmHg) 102 109 -7.0 -6.422018 NaN Y NaN

5 rows × 30 columns

In [72]:
# Clean data: force the numeric columns to numbers; anything unparseable becomes NaN.
for numeric_col in ("AVAL", "CHG", "AVISITN"):
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors="coerce")

# Parse the assessment date day-first; anything unparseable becomes NaT.
df["ADT"] = pd.to_datetime(df["ADT"], dayfirst=True, errors="coerce")

# Basic check: frame dimensions, number of rows per site, and a preview.
print(df.shape)
rows_per_site = df["SITEID"].value_counts()
print(rows_per_site)
print(df.head())
(2320, 30)
SITEID
SITE03    420
SITE06    400
SITE04    394
SITE02    382
SITE05    374
SITE01    350
Name: count, dtype: int64
      STUDYID     USUBJID  SUBJID  SITEID COUNTRY  AGE SEX   RACE  TRT01P  \
0  BP-RCT-001  SITE01-001       1  SITE01     GBR   70   M  WHITE  ACTIVE   
1  BP-RCT-001  SITE01-001       1  SITE01     GBR   70   M  WHITE  ACTIVE   
2  BP-RCT-001  SITE01-001       1  SITE01     GBR   70   M  WHITE  ACTIVE   
3  BP-RCT-001  SITE01-001       1  SITE01     GBR   70   M  WHITE  ACTIVE   
4  BP-RCT-001  SITE01-001       1  SITE01     GBR   70   M  WHITE  ACTIVE   

   TRT01PN  ... AVISITN  PARAMCD                            PARAM AVAL BASE  \
0        1  ...       1    DIABP  Diastolic Blood Pressure (mmHg)  109  109   
1        1  ...       2    DIABP  Diastolic Blood Pressure (mmHg)  107  109   
2        1  ...       3    DIABP  Diastolic Blood Pressure (mmHg)  108  109   
3        1  ...       4    DIABP  Diastolic Blood Pressure (mmHg)  104  109   
4        1  ...       5    DIABP  Diastolic Blood Pressure (mmHg)  102  109   

   CHG      PCHG ABLFL  ANL01FL DTYPE  
0  NaN       NaN     Y        Y   NaN  
1 -2.0 -1.834862   NaN        Y   NaN  
2 -1.0 -0.917431   NaN        Y   NaN  
3 -5.0 -4.587156   NaN        Y   NaN  
4 -7.0 -6.422018   NaN        Y   NaN  

[5 rows x 30 columns]

1. Digit preference: round numbers¶

In [73]:
# AVAL may contain NaN (the cleaning step coerces unparseable values to NaN), and
# casting NaN to int raises. Derive the last digit from the non-missing values only.
has_aval = df["AVAL"].notna()
last_digit = df.loc[has_aval, "AVAL"].astype(int).abs() % 10

# Assignment aligns on the index: rows without an AVAL get NaN, and the column
# stays integer-typed when nothing is missing.
df["last_digit"] = last_digit
# NaN is never in [0, 5], so rows without an AVAL are flagged False here.
df["round_0_5"] = df["last_digit"].isin([0, 5])

# Share of values ending in 0 or 5 per site, computed over non-missing values only
# so that missing measurements do not dilute the proportion.
digit_pref = (
    df[has_aval]
    .groupby("SITEID")["round_0_5"]
    .mean()
    .reset_index(name="proportion_round_0_or_5")
)

print(digit_pref)

fig, ax = plt.subplots(figsize=(7, 4))
ax.bar(digit_pref["SITEID"], digit_pref["proportion_round_0_or_5"])
ax.set_title("Digit Preference by Site")
ax.set_xlabel("Site")
ax.set_ylabel("Proportion of BP values ending in 0 or 5")
ax.set_ylim(0, 1)
ax.grid(axis="y", alpha=0.3)
fig.tight_layout()
plt.show()
   SITEID  proportion_round_0_or_5
0  SITE01                 0.242857
1  SITE02                 0.222513
2  SITE03                 0.821429
3  SITE04                 0.187817
4  SITE05                 0.181818
5  SITE06                 0.185000
No description has been provided for this image

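If one site has many more values ending in 0 or 5 than the others, that suggests its measurements may have been rounded or fabricated. With no digit preference, about 20% of values would end in 0 or 5; SITE03 is at 82%.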

2. Last digit distribution by site¶

In [74]:
# Row-normalised table: for each site, the share of values ending in each digit.
# Re-indexing the columns guarantees that all ten digits 0-9 are present (a digit
# that never occurs shows as 0), so the tick labels below always match the columns.
digit_table = (
    pd.crosstab(df["SITEID"], df["last_digit"], normalize="index")
    .reindex(columns=range(10), fill_value=0.0)
)

print(digit_table)

fig, ax = plt.subplots(figsize=(9, 5))
image = ax.imshow(digit_table, aspect="auto")
fig.colorbar(image, ax=ax, label="Proportion")
# Take tick positions and labels from the table itself instead of hard-coding them.
ax.set_xticks(range(digit_table.shape[1]))
ax.set_xticklabels(digit_table.columns)
ax.set_yticks(range(len(digit_table.index)))
ax.set_yticklabels(digit_table.index)
ax.set_title("Distribution of Final Digits by Site")
ax.set_xlabel("Final Digit of BP Measurement")
ax.set_ylabel("Site")
fig.tight_layout()
plt.show()
last_digit         0         1         2         3         4         5  \
SITEID                                                                   
SITE01      0.111429  0.091429  0.117143  0.077143  0.088571  0.131429   
SITE02      0.107330  0.070681  0.115183  0.091623  0.117801  0.115183   
SITE03      0.295238  0.023810  0.023810  0.033333  0.033333  0.526190   
SITE04      0.098985  0.109137  0.106599  0.076142  0.081218  0.088832   
SITE05      0.077540  0.101604  0.088235  0.098930  0.122995  0.104278   
SITE06      0.087500  0.090000  0.087500  0.085000  0.117500  0.097500   

last_digit         6         7         8         9  
SITEID                                              
SITE01      0.091429  0.102857  0.100000  0.088571  
SITE02      0.091623  0.086387  0.102094  0.102094  
SITE03      0.026190  0.011905  0.011905  0.014286  
SITE04      0.101523  0.126904  0.126904  0.083756  
SITE05      0.101604  0.106952  0.122995  0.074866  
SITE06      0.102500  0.120000  0.115000  0.097500  
No description has been provided for this image
In [75]:
# Per-site digit shares again, rounded to three decimals for the annotated heatmap.
digit_table = pd.crosstab(
    df["SITEID"],
    df["last_digit"],
    normalize="index",
).round(3)

fig, ax = plt.subplots(figsize=(10, 4))
sns.heatmap(
    digit_table,
    annot=True,
    fmt=".2f",
    cmap="YlOrRd",
    linewidths=0.5,
    cbar_kws={"label": "Proportion"},
    ax=ax,
)
ax.set_title("Last digit distribution by site — SITE03 shows extreme preference for 0 and 5")
ax.set_xlabel("Last digit of BP value")
fig.tight_layout()
plt.show()
No description has been provided for this image

Story: real measurements should show a roughly uniform spread of final digits (about 10% each). SITE03 stands out: about 30% of its values end in 0 and about 53% end in 5.

3. Variability: fake data is often too consistent¶

In [76]:
# Keep only records with a visit number above 1.
post_base = df.loc[df["AVISITN"] > 1]

# Standard deviation of CHG for every site / parameter combination.
variability = (
    post_base.groupby(["SITEID", "PARAMCD"])["CHG"]
    .std()
    .reset_index(name="sd_change")
)

print(variability)

# One bar chart per parameter code.
for param in ("SYSBP", "DIABP"):
    param_sd = variability.loc[variability["PARAMCD"] == param]

    fig, ax = plt.subplots(figsize=(7, 4))
    ax.bar(param_sd["SITEID"], param_sd["sd_change"])
    ax.set_title(f"Variability in Change from Baseline: {param}")
    ax.set_xlabel("Site")
    ax.set_ylabel("Standard deviation of change")
    ax.grid(axis="y", alpha=0.3)
    fig.tight_layout()
    plt.show()
    SITEID PARAMCD  sd_change
0   SITE01   DIABP   4.962773
1   SITE01   SYSBP   7.369588
2   SITE02   DIABP   5.630034
3   SITE02   SYSBP   9.200822
4   SITE03   DIABP   3.327271
5   SITE03   SYSBP   5.223485
6   SITE04   DIABP   5.778665
7   SITE04   SYSBP   8.947973
8   SITE05   DIABP   6.394070
9   SITE05   SYSBP   7.268345
10  SITE06   DIABP   5.131955
11  SITE06   SYSBP   7.276693
No description has been provided for this image
No description has been provided for this image

Story: suspicious fabricated data often has lower variability than real patient data.

4. Missing data / dropout¶

In [77]:
# Number of distinct visits a subject needs in order to count as a completer.
# Named once here so the calculation and the axis label cannot drift apart.
N_SCHEDULED_VISITS = 7

# Collapse to one row per (site, subject, visit number, date), ordered by visit.
visits = (
    df.drop_duplicates(["SITEID", "USUBJID", "AVISITN", "ADT"])
    .sort_values(["SITEID", "USUBJID", "AVISITN"])
)

# Number of distinct visit numbers recorded for each subject.
visits_per_subject = (
    visits.groupby(["SITEID", "USUBJID"])["AVISITN"]
    .nunique()
    .reset_index(name="n_visits")
)

# Per site: mean number of visits, and share of subjects with every scheduled visit.
completion = (
    visits_per_subject
    .assign(is_complete=lambda d: d["n_visits"] == N_SCHEDULED_VISITS)
    .groupby("SITEID")
    .agg(
        mean_visits=("n_visits", "mean"),
        full_completion=("is_complete", "mean"),
    )
    .reset_index()
)

print(completion)

fig, ax = plt.subplots(figsize=(7, 4))
ax.bar(completion["SITEID"], completion["full_completion"])
ax.set_title("Proportion of Subjects With Complete Visits")
ax.set_xlabel("Site")
ax.set_ylabel(f"Proportion with all {N_SCHEDULED_VISITS} visits")
ax.set_ylim(0, 1.05)
ax.grid(axis="y", alpha=0.3)
fig.tight_layout()
plt.show()
   SITEID  mean_visits  full_completion
0  SITE01     5.833333         0.666667
1  SITE02     6.366667         0.766667
2  SITE03     7.000000         1.000000
3  SITE04     6.566667         0.700000
4  SITE05     6.233333         0.700000
5  SITE06     6.666667         0.833333
No description has been provided for this image

Every SITE03 subject completed all 7 visits. That is suspicious, because real trials usually have some missed visits or dropouts; at the other sites only 67–83% of subjects completed every visit.

5. Visit regularity¶

In [78]:
# Days elapsed since the same subject's previous row in `visits`
# (missing for each subject's first row).
gap_since_previous = visits.groupby(["SITEID", "USUBJID"])["ADT"].diff()
visits["days_between"] = gap_since_previous.dt.days

# Keep only rows that actually have a preceding visit to measure from.
intervals = visits.dropna(subset=["days_between"])

# Per-site summary of the gaps between consecutive visits.
interval_summary = (
    intervals.groupby("SITEID")["days_between"]
    .agg(["mean", "std", "min", "max"])
    .reset_index()
)

print(interval_summary)

fig, ax = plt.subplots(figsize=(7, 4))
ax.bar(interval_summary["SITEID"], interval_summary["std"])
ax.set_title("Visit Timing Variability by Site")
ax.set_xlabel("Site")
ax.set_ylabel("Standard deviation of days between visits")
ax.grid(axis="y", alpha=0.3)
fig.tight_layout()
plt.show()
   SITEID       mean       std   min   max
0  SITE01  28.124138  2.938922  23.0  34.0
1  SITE02  27.987578  2.537192  22.0  34.0
2  SITE03  28.000000  0.000000  28.0  28.0
3  SITE04  27.976048  2.744001  22.0  34.0
4  SITE05  27.834395  3.094344  22.0  34.0
5  SITE06  27.894118  2.782122  22.0  34.0
No description has been provided for this image

Story: in real life, visit intervals vary. If a site has almost zero variation, that is suspicious.

In [79]:
suspect_site = "SITE03"
at_suspect_site = df["SITEID"] == suspect_site

# One figure per parameter, with one line per subject at the suspect site.
for param in ("SYSBP", "DIABP"):
    param_rows = df[at_suspect_site & (df["PARAMCD"] == param)]

    fig, ax = plt.subplots(figsize=(8, 5))

    for _, trajectory in param_rows.groupby("USUBJID"):
        # Order each subject's rows by visit number so the line runs left to right.
        trajectory = trajectory.sort_values("AVISITN")
        ax.plot(trajectory["AVISITN"], trajectory["AVAL"], alpha=0.4)

    ax.set_title(f"Patient BP Trajectories at {suspect_site}: {param}")
    ax.set_xlabel("Visit")
    ax.set_ylabel("Blood Pressure")
    ax.grid(alpha=0.3)
    fig.tight_layout()
    plt.show()
No description has been provided for this image
No description has been provided for this image
In [80]:
# `sites` is otherwise first assigned in a later cell, so on a fresh kernel
# (Restart & Run All) this cell would raise NameError. Derive it from the data here.
sites = sorted(df["SITEID"].dropna().unique())

fig, axes = plt.subplots(2, 3, figsize=(16, 10), sharey=False)
fig.suptitle("Patient BP Trajectories by Site — DIABP", fontsize=14, fontweight="bold")

panel_axes = axes.flatten()
for ax, site in zip(panel_axes, sites):
    # The suspect site is drawn in red, every other site in green.
    is_suspect = site == "SITE03"
    line_color = "#E24B4A" if is_suspect else "#1D9E75"

    site_data = df[(df["SITEID"] == site) & (df["PARAMCD"] == "DIABP")]
    for _, group in site_data.groupby("USUBJID"):
        # Order each subject's rows by visit number so the line runs left to right.
        group = group.sort_values("AVISITN")
        ax.plot(group["AVISITN"], group["AVAL"], alpha=0.4, color=line_color, linewidth=1)

    ax.set_title(site, fontweight="bold",
                 color="#E24B4A" if is_suspect else "black")
    ax.set_xlabel("Visit")
    ax.set_ylabel("Diastolic BP (mmHg)")
    ax.grid(alpha=0.3)
    ax.spines[["top", "right"]].set_visible(False)

# Hide any panels left over when there are fewer sites than grid cells.
for unused_ax in panel_axes[len(sites):]:
    unused_ax.set_visible(False)

fig.tight_layout()
plt.show()
No description has been provided for this image
In [81]:
# `sites` is otherwise first assigned in a later cell, so on a fresh kernel
# (Restart & Run All) this cell would raise NameError. Derive it from the data here.
sites = sorted(df["SITEID"].dropna().unique())

fig, axes = plt.subplots(2, 3, figsize=(16, 10), sharey=False)
fig.suptitle("Patient BP Trajectories by Site — SYSBP", fontsize=14, fontweight="bold")

panel_axes = axes.flatten()
for ax, site in zip(panel_axes, sites):
    # The suspect site is drawn in red, every other site in green.
    is_suspect = site == "SITE03"
    line_color = "#E24B4A" if is_suspect else "#1D9E75"

    site_data = df[(df["SITEID"] == site) & (df["PARAMCD"] == "SYSBP")]
    for _, group in site_data.groupby("USUBJID"):
        # Order each subject's rows by visit number so the line runs left to right.
        group = group.sort_values("AVISITN")
        ax.plot(group["AVISITN"], group["AVAL"], alpha=0.4, color=line_color, linewidth=1)

    ax.set_title(site, fontweight="bold",
                 color="#E24B4A" if is_suspect else "black")
    ax.set_xlabel("Visit")
    ax.set_ylabel("Systolic BP (mmHg)")
    ax.grid(alpha=0.3)
    ax.spines[["top", "right"]].set_visible(False)

# Hide any panels left over when there are fewer sites than grid cells.
for unused_ax in panel_axes[len(sites):]:
    unused_ax.set_visible(False)

fig.tight_layout()
plt.show()
No description has been provided for this image
In [82]:
# Digit preference: share of values ending in 0 or 5, per site.
digit_pref = (
    df.groupby("SITEID")["round_0_5"]
    .mean()
    .rename("round_digit_rate")
    .reset_index()
)

# Variability: SD of CHG per site, pooled over all parameters in `post_base`.
site_var = (
    post_base.groupby("SITEID")["CHG"]
    .std()
    .rename("change_sd")
    .reset_index()
)

# Completion: keep only the share of subjects with all visits.
completion = completion.loc[:, ["SITEID", "full_completion"]]

# Visit timing variability: SD of the days between consecutive visits.
interval_sd = (
    interval_summary.loc[:, ["SITEID", "std"]]
    .rename(columns={"std": "visit_interval_sd"})
)

# Combine: join the four indicator tables on site, one after another.
fraud_indicators = digit_pref
for indicator_table in (site_var, completion, interval_sd):
    fraud_indicators = fraud_indicators.merge(indicator_table, on="SITEID")

print(fraud_indicators)
   SITEID  round_digit_rate  change_sd  full_completion  visit_interval_sd
0  SITE01          0.242857   6.277419         0.666667           2.938922
1  SITE02          0.222513   7.831257         0.766667           2.537192
2  SITE03          0.821429   4.621213         1.000000           0.000000
3  SITE04          0.187817   7.634920         0.700000           2.744001
4  SITE05          0.181818   6.835951         0.700000           3.094344
5  SITE06          0.185000   6.339367         0.833333           2.782122
In [83]:
# Map the site-level indicators onto the dashboard.
sites = fraud_indicators["SITEID"].tolist()
indicators_by_site = fraud_indicators.set_index("SITEID")
round_vals = indicators_by_site["round_digit_rate"] * 100   # as a percentage
sd_vals = indicators_by_site["change_sd"]
comp_vals = indicators_by_site["full_completion"] * 100     # as a percentage
int_vals = indicators_by_site["visit_interval_sd"]

fraud_site = "SITE03"
FLAG_COLOR, NORMAL_COLOR, LABEL_COLOR = "#E24B4A", "#1D9E75", "#444"
colors = [FLAG_COLOR if s == fraud_site else NORMAL_COLOR for s in sites]

fig = plt.figure(figsize=(14, 10), facecolor="white")
fig.suptitle("Clinical Trial Fraud Detection — Site-Level Analysis",
             fontsize=15, fontweight="bold", y=0.98)

# Figure.add_gridspec builds the 2x2 grid without a mid-notebook import of
# matplotlib.gridspec.
gs = fig.add_gridspec(2, 2, hspace=0.45, wspace=0.35)

# One entry per panel:
# (grid cell, values, y-axis label, title, y-axis upper limit, label offset, label format)
panels = [
    (gs[0, 0], round_vals, "% values ending in 0 or 5",
     "Digit preference", 100, 1, "{:.0f}%"),
    (gs[0, 1], sd_vals, "SD of CHG (mmHg)",
     "CHG variability", 12, 0.15, "{:.1f}"),
    (gs[1, 0], comp_vals, "% subjects completing all visits",
     "Visit completion rate", 115, 1, "{:.0f}%"),
    (gs[1, 1], int_vals, "SD of days between visits",
     "Visit scheduling variability", 4.5, 0.05, "{:.2f}"),
]

for cell, values, ylabel, title, y_max, label_offset, label_format in panels:
    ax = fig.add_subplot(cell)
    ax.bar(sites, values[sites], color=colors, edgecolor="white")
    ax.set_ylabel(ylabel)
    ax.set_title(title, fontweight="bold")
    ax.set_ylim(0, y_max)
    # Print each bar's value just above it; the flagged site's label is red.
    for i, (s, v) in enumerate(zip(sites, values[sites])):
        ax.text(i, v + label_offset, label_format.format(v), ha="center", fontsize=9,
                color=FLAG_COLOR if s == fraud_site else LABEL_COLOR)
    ax.spines[["top", "right"]].set_visible(False)

# Verdict banner: numbers are read from the indicators so the text cannot go
# stale when the data changes.
if sd_vals.idxmin() == fraud_site:
    variability_note = "Lowest CHG variability"
else:
    variability_note = f"CHG SD {sd_vals[fraud_site]:.1f}"

banner_text = (
    f"⚠  {fraud_site} flagged: {round_vals[fraud_site]:.0f}% digit preference  |  "
    f"{comp_vals[fraud_site]:.0f}% completion  |  "
    f"{round(int_vals[fraud_site], 1):g}-day visit interval SD  |  "
    f"{variability_note} — recommend audit"
)
fig.text(0.5, 0.01, banner_text,
         ha="center", fontsize=10, color="#A32D2D",
         bbox=dict(boxstyle="round,pad=0.4", facecolor="#FCEBEB",
                   edgecolor=FLAG_COLOR, linewidth=1.5))

fig.savefig("fraud_detection.png", dpi=150, bbox_inches="tight")
plt.show()
No description has been provided for this image

Summary dashboard¶

Four independent statistical indicators are examined across all six sites. SITE03 (red) is a clear outlier on every measure simultaneously.

Overall suspicion score¶

Each indicator is standardised to a z-score (mean = 0, SD = 1) so they can be combined on the same scale. Positive = more suspicious than average, negative = less suspicious. The four z-scores are summed into one composite score. No legitimate site fails all four checks — SITE03 does.

In [67]:
def _suspicion_z(values, higher_is_suspicious=True):
    """Standardise one indicator so that larger results mean "more suspicious".

    Parameters
    ----------
    values : pd.Series
        One indicator, one entry per site.
    higher_is_suspicious : bool
        True if large raw values are suspicious; False if small raw values are
        (the z-score is then negated).

    Returns
    -------
    pd.Series
        Z-scores (sample SD) on the same index. If the SD is zero or undefined
        (all sites equal, or a single site), every site gets 0 so the indicator
        contributes nothing instead of turning the composite into NaN/inf.
    """
    spread = values.std()
    if not spread > 0:  # also true when spread is NaN
        return pd.Series(0.0, index=values.index)
    z = (values - values.mean()) / spread
    return z if higher_is_suspicious else -z


score = fraud_indicators.copy()

# score column -> (raw indicator column, whether a higher raw value is more suspicious)
indicator_directions = {
    "round_score": ("round_digit_rate", True),             # more round digits
    "low_variability_score": ("change_sd", False),         # lower CHG variability
    "completion_score": ("full_completion", True),         # higher completion
    "regular_visit_score": ("visit_interval_sd", False),   # lower visit interval SD
}

for score_col, (indicator_col, higher_is_suspicious) in indicator_directions.items():
    score[score_col] = _suspicion_z(score[indicator_col], higher_is_suspicious)

# Composite: plain sum of the four standardised indicators.
score["overall_suspicion_score"] = score[list(indicator_directions)].sum(axis=1)

score = score.sort_values("overall_suspicion_score", ascending=False)

print(score[[
    "SITEID",
    "round_digit_rate",
    "change_sd",
    "full_completion",
    "visit_interval_sd",
    "overall_suspicion_score"
]])


bar_colors = ['#E24B4A' if s == 'SITE03' else '#1D9E75'
              for s in score["SITEID"]]

fig, ax = plt.subplots(figsize=(7, 4))
ax.bar(score["SITEID"], score["overall_suspicion_score"], color=bar_colors)
ax.set_title("Overall suspicion score by site")
ax.set_xlabel("Site")
ax.set_ylabel("Suspicion score (z-score composite)")
ax.axhline(0, color='black', linewidth=0.8, linestyle='--')
ax.grid(axis="y", alpha=0.3)
fig.tight_layout()
plt.show()
   SITEID  round_digit_rate  change_sd  full_completion  visit_interval_sd  \
2  SITE03          0.821429   4.621213         1.000000           0.000000   
5  SITE06          0.185000   6.339367         0.833333           2.782122   
0  SITE01          0.242857   6.277419         0.666667           2.938922   
1  SITE02          0.222513   7.831257         0.766667           2.537192   
4  SITE05          0.181818   6.835951         0.700000           3.094344   
3  SITE04          0.187817   7.634920         0.700000           2.744001   

   overall_suspicion_score  
2                 7.533657  
5                -0.188782  
0                -1.384082  
1                -1.653663  
4                -1.971274  
3                -2.335856  
No description has been provided for this image

Conclusion¶

SITE03 is flagged as the suspected fraudulent site based on four independent indicators:

  • 82% of BP values end in 0 or 5 (vs 18–24% at the other sites; about 20% is expected by chance)
  • 0% dropout — all 30 subjects completed all 7 visits (implausible in a real trial)
  • Visit interval SD = 0 — every visit on the exact scheduled day (impossible at scale)
  • Lowest CHG variability (SD = 4.6 vs 6.3–7.8 elsewhere)

No legitimate site fails all four checks simultaneously. Recommend audit of SITE03.

Create HTML¶

In [68]:
# Google Colab only: mount Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
In [69]:
# Notebook file on the mounted Drive that gets converted (presumably this notebook — confirm).
path = "/content/drive/MyDrive/Colab Notebooks/Fraud in clinical trials.ipynb"
# Shell out to nbconvert to render the notebook as HTML with images embedded.
!jupyter nbconvert --to html --embed-images "{path}"

# Same path with the .ipynb extension swapped for .html.
html_path = path.replace(".ipynb", ".html")
# Google Colab only: hand the generated HTML file to the Colab download helper.
from google.colab import files
files.download(html_path)
[NbConvertApp] Converting notebook /content/drive/MyDrive/Colab Notebooks/Fraud in clinical trials.ipynb to html
[NbConvertApp] WARNING | Alternative text is missing on 13 image(s).
[NbConvertApp] Writing 2986897 bytes to /content/drive/MyDrive/Colab Notebooks/Fraud in clinical trials.html
In [ ]: