import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Plot defaults applied to every figure created below.
sns.set_theme(style="whitegrid")
plt.rcParams.update({"figure.dpi": 120})

# Location of the CSV that pd.read_csv loads into df.
URL = "https://raw.githubusercontent.com/VIS-SIG/Wonderful-Wednesdays/master/data/2026/2026-05-13/ADVS.csv"

df = pd.read_csv(URL)
df.head()  # bare expression: only rendered when it ends a notebook cell

# Clean data: anything that cannot be parsed becomes NaN / NaT instead of raising.
for numeric_col in ("AVAL", "CHG", "AVISITN"):
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors="coerce")
df["ADT"] = pd.to_datetime(df["ADT"], errors="coerce", dayfirst=True)

# Basic check: frame dimensions, number of records per site, first rows.
for summary in (df.shape, df["SITEID"].value_counts(), df.head()):
    print(summary)

(2320, 30)
SITEID
SITE03    420
SITE06    400
SITE04    394
SITE02    382
SITE05    374
SITE01    350
Name: count, dtype: int64
      STUDYID     USUBJID  SUBJID  SITEID COUNTRY  AGE SEX   RACE  TRT01P  \
0  BP-RCT-001  SITE01-001       1  SITE01     GBR   70   M  WHITE  ACTIVE   
1  BP-RCT-001  SITE01-001       1  SITE01     GBR   70   M  WHITE  ACTIVE   
2  BP-RCT-001  SITE01-001       1  SITE01     GBR   70   M  WHITE  ACTIVE   
3  BP-RCT-001  SITE01-001       1  SITE01     GBR   70   M  WHITE  ACTIVE   
4  BP-RCT-001  SITE01-001       1  SITE01     GBR   70   M  WHITE  ACTIVE   

   TRT01PN  ... AVISITN  PARAMCD                            PARAM AVAL BASE  \
0        1  ...       1    DIABP  Diastolic Blood Pressure (mmHg)  109  109   
1        1  ...       2    DIABP  Diastolic Blood Pressure (mmHg)  107  109   
2        1  ...       3    DIABP  Diastolic Blood Pressure (mmHg)  108  109   
3        1  ...       4    DIABP  Diastolic Blood Pressure (mmHg)  104  109   
4        1  ...       5    DIABP  Diastolic Blood Pressure (mmHg)  102  109   

   CHG      PCHG ABLFL  ANL01FL DTYPE  
0  NaN       NaN     Y        Y   NaN  
1 -2.0 -1.834862   NaN        Y   NaN  
2 -1.0 -0.917431   NaN        Y   NaN  
3 -5.0 -4.587156   NaN        Y   NaN  
4 -7.0 -6.422018   NaN        Y   NaN  

[5 rows x 30 columns]

# Final digit of every recorded value.
# np.trunc drops any fractional part toward zero, exactly like the former
# astype(int), but does not raise when AVAL is NaN (which the errors="coerce"
# cleaning step can produce). The nullable Int64 dtype keeps those rows as <NA>
# while the digit labels stay integers (0..9) for the cross-tabulations below.
aval_known = df["AVAL"].notna()
df["last_digit"] = (np.trunc(df["AVAL"]).abs() % 10).astype("Int64")

# 1.0 when the final digit is 0 or 5, 0.0 otherwise, NaN when AVAL is missing.
# Keeping NaN (instead of False) means .mean() is the proportion among
# recorded values only; a missing measurement is not counted as "not round".
df["round_0_5"] = (
    df["last_digit"].isin([0, 5]).astype(float).where(aval_known)
)

# Per-site share of values ending in 0 or 5.
digit_pref = (
    df.groupby("SITEID")["round_0_5"]
    .mean()
    .reset_index(name="proportion_round_0_or_5")
)

print(digit_pref)

plt.figure(figsize=(7, 4))
plt.bar(digit_pref["SITEID"], digit_pref["proportion_round_0_or_5"])
plt.title("Digit Preference by Site")
plt.xlabel("Site")
plt.ylabel("Proportion of BP values ending in 0 or 5")
plt.ylim(0, 1)  # proportions: fix the axis to the full 0-1 range
plt.grid(axis="y", alpha=0.3)
plt.tight_layout()
plt.show()

   SITEID  proportion_round_0_or_5
0  SITE01                 0.242857
1  SITE02                 0.222513
2  SITE03                 0.821429
3  SITE04                 0.187817
4  SITE05                 0.181818
5  SITE06                 0.185000

# Row-normalised cross-tabulation: for each site (row), the share of values
# ending in each final digit (column). Each row sums to 1.
digit_table = pd.crosstab(df["SITEID"], df["last_digit"], normalize="index")

print(digit_table)

plt.figure(figsize=(9, 5))
plt.imshow(digit_table, aspect="auto")
plt.colorbar(label="Proportion")
# Take tick labels from the table itself (as the y axis already does) rather
# than assuming all ten digits are present: if a digit never occurs, the
# crosstab has fewer than ten columns and hard-coded 0..9 labels would be
# shifted against the data.
plt.xticks(range(len(digit_table.columns)), digit_table.columns)
plt.yticks(range(len(digit_table.index)), digit_table.index)
plt.title("Distribution of Final Digits by Site")
plt.xlabel("Final Digit of BP Measurement")
plt.ylabel("Site")
plt.tight_layout()
plt.show()

last_digit         0         1         2         3         4         5  \
SITEID                                                                   
SITE01      0.111429  0.091429  0.117143  0.077143  0.088571  0.131429   
SITE02      0.107330  0.070681  0.115183  0.091623  0.117801  0.115183   
SITE03      0.295238  0.023810  0.023810  0.033333  0.033333  0.526190   
SITE04      0.098985  0.109137  0.106599  0.076142  0.081218  0.088832   
SITE05      0.077540  0.101604  0.088235  0.098930  0.122995  0.104278   
SITE06      0.087500  0.090000  0.087500  0.085000  0.117500  0.097500   

last_digit         6         7         8         9  
SITEID                                              
SITE01      0.091429  0.102857  0.100000  0.088571  
SITE02      0.091623  0.086387  0.102094  0.102094  
SITE03      0.026190  0.011905  0.011905  0.014286  
SITE04      0.101523  0.126904  0.126904  0.083756  
SITE05      0.101604  0.106952  0.122995  0.074866  
SITE06      0.102500  0.120000  0.115000  0.097500

# Same site-by-final-digit shares as a row-normalised crosstab, rounded to
# three decimals and drawn as an annotated heatmap.
site_digit_share = pd.crosstab(df["SITEID"], df["last_digit"], normalize="index")
digit_table = site_digit_share.round(3)

heatmap_options = {
    "annot": True,          # write the value into every cell
    "fmt": ".2f",
    "cmap": "YlOrRd",
    "linewidths": 0.5,
    "cbar_kws": {"label": "Proportion"},
}

_, heat_ax = plt.subplots(figsize=(10, 4))
sns.heatmap(digit_table, ax=heat_ax, **heatmap_options)
heat_ax.set_title("Last digit distribution by site — SITE03 shows extreme preference for 0 and 5")
heat_ax.set_xlabel("Last digit of BP value")
plt.tight_layout()
plt.show()

# Rows after visit 1 (visit 1 is the row flagged ABLFL == "Y" in the preview
# output, i.e. presumably the baseline visit — confirm against the data spec).
is_after_first_visit = df["AVISITN"].gt(1)
post_base = df[is_after_first_visit]

# Standard deviation of the change from baseline for every site / parameter pair.
chg_sd = post_base.groupby(["SITEID", "PARAMCD"])["CHG"].std()
variability = chg_sd.reset_index(name="sd_change")

print(variability)

# One bar chart per parameter, one bar per site.
for param in ("SYSBP", "DIABP"):
    param_rows = variability.loc[variability["PARAMCD"].eq(param)]

    _, sd_ax = plt.subplots(figsize=(7, 4))
    sd_ax.bar(param_rows["SITEID"], param_rows["sd_change"])
    sd_ax.set_title(f"Variability in Change from Baseline: {param}")
    sd_ax.set_xlabel("Site")
    sd_ax.set_ylabel("Standard deviation of change")
    sd_ax.grid(axis="y", alpha=0.3)
    plt.tight_layout()
    plt.show()

    SITEID PARAMCD  sd_change
0   SITE01   DIABP   4.962773
1   SITE01   SYSBP   7.369588
2   SITE02   DIABP   5.630034
3   SITE02   SYSBP   9.200822
4   SITE03   DIABP   3.327271
5   SITE03   SYSBP   5.223485
6   SITE04   DIABP   5.778665
7   SITE04   SYSBP   8.947973
8   SITE05   DIABP   6.394070
9   SITE05   SYSBP   7.268345
10  SITE06   DIABP   5.131955
11  SITE06   SYSBP   7.276693

# One row per distinct (site, subject, visit number, visit date), ordered by
# visit number within each subject.
visit_keys = ["SITEID", "USUBJID", "AVISITN", "ADT"]
visits = df.drop_duplicates(visit_keys).sort_values(["SITEID", "USUBJID", "AVISITN"])

# Number of distinct visit numbers recorded for each subject.
distinct_visits = visits.groupby(["SITEID", "USUBJID"])["AVISITN"].nunique()
visits_per_subject = distinct_visits.reset_index(name="n_visits")


def _share_with_all_visits(n_visits):
    """Return the fraction of subjects that have exactly 7 distinct visits."""
    return n_visits.eq(7).mean()


# Per site: average number of visits and share of subjects with all 7.
completion = (
    visits_per_subject.groupby("SITEID")
    .agg(
        mean_visits=("n_visits", "mean"),
        full_completion=("n_visits", _share_with_all_visits),
    )
    .reset_index()
)

print(completion)

_, completion_ax = plt.subplots(figsize=(7, 4))
completion_ax.bar(completion["SITEID"], completion["full_completion"])
completion_ax.set_title("Proportion of Subjects With Complete Visits")
completion_ax.set_xlabel("Site")
completion_ax.set_ylabel("Proportion with all 7 visits")
completion_ax.set_ylim(0, 1.05)  # small headroom above a 100% bar
completion_ax.grid(axis="y", alpha=0.3)
plt.tight_layout()
plt.show()

   SITEID  mean_visits  full_completion
0  SITE01     5.833333         0.666667
1  SITE02     6.366667         0.766667
2  SITE03     7.000000         1.000000
3  SITE04     6.566667         0.700000
4  SITE05     6.233333         0.700000
5  SITE06     6.666667         0.833333

# Days elapsed since the same subject's previous row in `visits`.
# The first row of every subject has no predecessor and therefore gets NaN.
dates_by_subject = visits.groupby(["SITEID", "USUBJID"])["ADT"]
visits["days_between"] = dates_by_subject.diff().dt.days

# Keep only rows that actually have an interval.
intervals = visits[visits["days_between"].notna()]

# Per-site summary of the intervals between consecutive visits.
interval_stats = intervals.groupby("SITEID")["days_between"].agg(["mean", "std", "min", "max"])
interval_summary = interval_stats.reset_index()

print(interval_summary)

_, interval_ax = plt.subplots(figsize=(7, 4))
interval_ax.bar(interval_summary["SITEID"], interval_summary["std"])
interval_ax.set_title("Visit Timing Variability by Site")
interval_ax.set_xlabel("Site")
interval_ax.set_ylabel("Standard deviation of days between visits")
interval_ax.grid(axis="y", alpha=0.3)
plt.tight_layout()
plt.show()

   SITEID       mean       std   min   max
0  SITE01  28.124138  2.938922  23.0  34.0
1  SITE02  27.987578  2.537192  22.0  34.0
2  SITE03  28.000000  0.000000  28.0  28.0
3  SITE04  27.976048  2.744001  22.0  34.0
4  SITE05  27.834395  3.094344  22.0  34.0
5  SITE06  27.894118  2.782122  22.0  34.0

# Site whose individual subject trajectories are drawn below.
suspect_site = "SITE03"
at_suspect_site = df["SITEID"] == suspect_site

# One figure per parameter; each line is one subject's values across visits.
for param in ("SYSBP", "DIABP"):
    temp = df[at_suspect_site & (df["PARAMCD"] == param)]

    plt.figure(figsize=(8, 5))

    for _, trajectory in temp.groupby("USUBJID"):
        ordered = trajectory.sort_values("AVISITN")
        # Semi-transparent so overlapping subjects stay distinguishable.
        plt.plot(ordered["AVISITN"], ordered["AVAL"], alpha=0.4)

    plt.title(f"Patient BP Trajectories at {suspect_site}: {param}")
    plt.xlabel("Visit")
    plt.ylabel("Blood Pressure")
    plt.grid(alpha=0.3)
    plt.tight_layout()
    plt.show()

def plot_site_trajectories(param, y_label, highlight_site="SITE03", n_cols=3):
    """Draw one panel per site with every subject's trajectory for one parameter.

    Reads the notebook-level ``df`` (columns SITEID, PARAMCD, USUBJID, AVISITN,
    AVAL). The list of sites is derived from ``df`` here, so the figure does
    not depend on a ``sites`` variable that is only assigned in a later cell.

    Args:
        param: PARAMCD value to plot (e.g. "DIABP" or "SYSBP").
        y_label: Label for the y axis of every panel.
        highlight_site: Site drawn in red; all other sites are drawn in green.
        n_cols: Number of panels per row. Rows are added as needed, so six
            sites give the 2 x 3 grid at 16 x 10 inches.
    """
    flag_color, normal_color = "#E24B4A", "#1D9E75"
    site_ids = sorted(df["SITEID"].dropna().unique())

    # Ceiling division; always at least one row so subplots() gets a valid grid.
    n_rows = max(1, -(-len(site_ids) // n_cols))
    fig, axes = plt.subplots(
        n_rows, n_cols, figsize=(16, 5 * n_rows), sharey=False, squeeze=False
    )
    fig.suptitle(f"Patient BP Trajectories by Site — {param}",
                 fontsize=14, fontweight="bold")

    panels = axes.flatten()
    for ax, site in zip(panels, site_ids):
        flagged = site == highlight_site
        line_color = flag_color if flagged else normal_color

        site_data = df[(df["SITEID"] == site) & (df["PARAMCD"] == param)]
        for _, group in site_data.groupby("USUBJID"):
            group = group.sort_values("AVISITN")
            ax.plot(group["AVISITN"], group["AVAL"],
                    alpha=0.4, color=line_color, linewidth=1)

        ax.set_title(site, fontweight="bold",
                     color=flag_color if flagged else "black")
        ax.set_xlabel("Visit")
        ax.set_ylabel(y_label)
        ax.grid(alpha=0.3)
        ax.spines[["top", "right"]].set_visible(False)

    # Hide grid cells that have no site, instead of showing empty axes.
    for unused in panels[len(site_ids):]:
        unused.set_visible(False)

    plt.tight_layout()
    plt.show()


plot_site_trajectories("DIABP", "Diastolic BP (mmHg)")
plot_site_trajectories("SYSBP", "Systolic BP (mmHg)")

# Digit preference: per-site share of values ending in 0 or 5.
round_rate_by_site = df.groupby("SITEID")["round_0_5"].mean()
digit_pref = round_rate_by_site.reset_index(name="round_digit_rate")

# Variability: per-site SD of the change from baseline.
# NOTE(review): post_base holds both SYSBP and DIABP rows, so this SD pools
# the two parameters; confirm that is intended rather than a per-parameter SD.
chg_sd_by_site = post_base.groupby("SITEID")["CHG"].std()
site_var = chg_sd_by_site.reset_index(name="change_sd")

# Completion: keep only the column used in the combined table.
completion = completion.loc[:, ["SITEID", "full_completion"]]

# Visit timing variability: SD of days between visits, under a descriptive name.
interval_sd = interval_summary.loc[:, ["SITEID", "std"]].rename(
    columns={"std": "visit_interval_sd"}
)

# Combine: one row per site, one column per indicator. The merges are inner
# joins, so a site missing from any one table is dropped.
fraud_indicators = digit_pref
for site_table in (site_var, completion, interval_sd):
    fraud_indicators = fraud_indicators.merge(site_table, on="SITEID")

print(fraud_indicators)

   SITEID  round_digit_rate  change_sd  full_completion  visit_interval_sd
0  SITE01          0.242857   6.277419         0.666667           2.938922
1  SITE02          0.222513   7.831257         0.766667           2.537192
2  SITE03          0.821429   4.621213         1.000000           0.000000
3  SITE04          0.187817   7.634920         0.700000           2.744001
4  SITE05          0.181818   6.835951         0.700000           3.094344
5  SITE06          0.185000   6.339367         0.833333           2.782122

import matplotlib.gridspec as gridspec

# Map the combined indicator table to the series the dashboard plots.
# Percent-valued indicators are scaled from proportions to 0-100.
sites = fraud_indicators["SITEID"].tolist()
indicators_by_site = fraud_indicators.set_index("SITEID")
round_vals = indicators_by_site["round_digit_rate"] * 100
sd_vals = indicators_by_site["change_sd"]
comp_vals = indicators_by_site["full_completion"] * 100
int_vals = indicators_by_site["visit_interval_sd"]

# The flagged site is drawn in red everywhere; all other sites in green.
fraud_site = "SITE03"
FLAG_COLOR = "#E24B4A"
colors = [FLAG_COLOR if s == fraud_site else "#1D9E75" for s in sites]

fig = plt.figure(figsize=(14, 10), facecolor="white")
fig.suptitle("Clinical Trial Fraud Detection — Site-Level Analysis",
             fontsize=15, fontweight="bold", y=0.98)

gs = gridspec.GridSpec(2, 2, figure=fig, hspace=0.45, wspace=0.35)

# One entry per panel:
# (grid cell, values, y label, title, minimum y-axis top, label offset, label format)
panel_specs = [
    (gs[0, 0], round_vals, "% values ending in 0 or 5",
     "Digit preference", 100, 1, "{:.0f}%"),
    (gs[0, 1], sd_vals, "SD of CHG (mmHg)",
     "CHG variability", 12, 0.15, "{:.1f}"),
    (gs[1, 0], comp_vals, "% subjects completing all visits",
     "Visit completion rate", 115, 1, "{:.0f}%"),
    (gs[1, 1], int_vals, "SD of days between visits",
     "Visit scheduling variability", 4.5, 0.05, "{:.2f}"),
]

panel_axes = []
for cell, values, y_label, title, y_top, label_gap, label_fmt in panel_specs:
    ax = fig.add_subplot(cell)
    site_values = values[sites]
    ax.bar(sites, site_values, color=colors, edgecolor="white")
    ax.set_ylabel(y_label)
    ax.set_title(title, fontweight="bold")
    # Use the fixed top unless a bar (plus its label) would not fit under it.
    # y_top comes first so that a NaN maximum falls back to the fixed value.
    ax.set_ylim(0, max(y_top, site_values.max() * 1.15))
    for i, (s, v) in enumerate(zip(sites, site_values)):
        ax.text(i, v + label_gap, label_fmt.format(v), ha="center", fontsize=9,
                color=FLAG_COLOR if s == fraud_site else "#444")
    ax.spines[["top", "right"]].set_visible(False)
    panel_axes.append(ax)

# Keep the per-panel names available to any later cell that refers to them.
ax1, ax2, ax3, ax4 = panel_axes

# Verdict banner, built from the plotted values so the text cannot drift away
# from the data. It is skipped when the flagged site is not in the table.
if fraud_site in indicators_by_site.index:
    banner_parts = [
        f"{round_vals[fraud_site]:.0f}% digit preference",
        f"{comp_vals[fraud_site]:.0f}% completion",
        f"{round(float(int_vals[fraud_site]), 1):g}-day visit interval SD",
    ]
    if sd_vals.idxmin() == fraud_site:
        banner_parts.append("Lowest CHG variability")
    else:
        banner_parts.append(f"CHG SD {sd_vals[fraud_site]:.1f} mmHg")

    fig.text(0.5, 0.01,
        f"⚠  {fraud_site} flagged: " + "  |  ".join(banner_parts)
        + " — recommend audit",
        ha="center", fontsize=10, color="#A32D2D",
        bbox=dict(boxstyle="round,pad=0.4", facecolor="#FCEBEB",
                  edgecolor=FLAG_COLOR, linewidth=1.5))

plt.savefig("fraud_detection.png", dpi=150, bbox_inches="tight")
plt.show()

def _suspicion_z(values, *, high_is_suspicious):
    """Standardise one indicator across sites so larger means more suspicious.

    Args:
        values: One indicator value per site (pandas Series).
        high_is_suspicious: True when large raw values are the suspicious
            direction; False when small raw values are (the z-score is negated).

    Returns:
        A Series of z-scores (sample SD, pandas default). When the SD is zero
        or undefined (all sites identical, or a single site) every site gets
        0.0, so the indicator contributes nothing instead of turning the
        overall score into NaN.
    """
    spread = values.std()
    if not spread > 0:  # also true when spread is NaN
        return pd.Series(0.0, index=values.index)
    z = (values - values.mean()) / spread
    return z if high_is_suspicious else -z


score = fraud_indicators.copy()

# Higher round_digit_rate = more suspicious
score["round_score"] = _suspicion_z(
    score["round_digit_rate"], high_is_suspicious=True
)

# Lower variability = more suspicious
score["low_variability_score"] = _suspicion_z(
    score["change_sd"], high_is_suspicious=False
)

# Higher completion = more suspicious
score["completion_score"] = _suspicion_z(
    score["full_completion"], high_is_suspicious=True
)

# Lower visit interval SD = more suspicious
score["regular_visit_score"] = _suspicion_z(
    score["visit_interval_sd"], high_is_suspicious=False
)

# Unweighted sum of the four standardised indicators.
# NOTE(review): with only a handful of sites, an extreme site inflates the
# mean and SD it is compared against; treat the score as a ranking aid.
score["overall_suspicion_score"] = (
    score["round_score"]
    + score["low_variability_score"]
    + score["completion_score"]
    + score["regular_visit_score"]
)

score = score.sort_values("overall_suspicion_score", ascending=False)

print(score[[
    "SITEID",
    "round_digit_rate",
    "change_sd",
    "full_completion",
    "visit_interval_sd",
    "overall_suspicion_score"
]])


# Same red/green convention as the other figures.
flagged_site = "SITE03"
bar_colors = ["#E24B4A" if s == flagged_site else "#1D9E75"
              for s in score["SITEID"]]

plt.figure(figsize=(7, 4))
plt.bar(score["SITEID"], score["overall_suspicion_score"], color=bar_colors)
plt.title("Overall suspicion score by site")
plt.xlabel("Site")
plt.ylabel("Suspicion score (z-score composite)")
# Zero line: sites above it are more suspicious than the average site.
plt.axhline(0, color="black", linewidth=0.8, linestyle="--")
plt.grid(axis="y", alpha=0.3)
plt.tight_layout()
plt.show()

   SITEID  round_digit_rate  change_sd  full_completion  visit_interval_sd  \
2  SITE03          0.821429   4.621213         1.000000           0.000000   
5  SITE06          0.185000   6.339367         0.833333           2.782122   
0  SITE01          0.242857   6.277419         0.666667           2.938922   
1  SITE02          0.222513   7.831257         0.766667           2.537192   
4  SITE05          0.181818   6.835951         0.700000           3.094344   
3  SITE04          0.187817   7.634920         0.700000           2.744001   

   overall_suspicion_score  
2                 7.533657  
5                -0.188782  
0                -1.384082  
1                -1.653663  
4                -1.971274  
3                -2.335856

# Mount Google Drive into this Colab runtime so the notebook file is reachable.
from google.colab import drive

DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

# Notebook to export; it is read from the Drive mounted under /content/drive.
path = "/content/drive/MyDrive/Colab Notebooks/Fraud in clinical trials.ipynb"
# IPython shell escape (not plain Python): runs nbconvert on the notebook with
# images embedded in the HTML. {path} is substituted by IPython before the
# command is run.
!jupyter nbconvert --to html --embed-images "{path}"

# nbconvert writes the HTML file beside the notebook, with the extension swapped.
html_path = path.replace(".ipynb", ".html")
from google.colab import files
# Hand the generated HTML file to the Colab download helper.
files.download(html_path)

[NbConvertApp] Converting notebook /content/drive/MyDrive/Colab Notebooks/Fraud in clinical trials.ipynb to html
[NbConvertApp] WARNING | Alternative text is missing on 13 image(s).
[NbConvertApp] Writing 2986897 bytes to /content/drive/MyDrive/Colab Notebooks/Fraud in clinical trials.html

1. Digit preference: round numbers

2. Last digit distribution by site

3. Variability: fake data is often too consistent

4. Missing data / dropout

5. Visit regularity

Summary dashboard

Overall suspicion score

Conclusion

Create HTML

	STUDYID	USUBJID	SUBJID	SITEID	COUNTRY	AGE	SEX	RACE	TRT01P	TRT01PN	...	AVISITN	PARAMCD	PARAM	AVAL	BASE	CHG	PCHG	ABLFL	ANL01FL	DTYPE
0	BP-RCT-001	SITE01-001	1	SITE01	GBR	70	M	WHITE	ACTIVE	1	...	1	DIABP	Diastolic Blood Pressure (mmHg)	109	109	NaN	NaN	Y	Y	NaN
1	BP-RCT-001	SITE01-001	1	SITE01	GBR	70	M	WHITE	ACTIVE	1	...	2	DIABP	Diastolic Blood Pressure (mmHg)	107	109	-2.0	-1.834862	NaN	Y	NaN
2	BP-RCT-001	SITE01-001	1	SITE01	GBR	70	M	WHITE	ACTIVE	1	...	3	DIABP	Diastolic Blood Pressure (mmHg)	108	109	-1.0	-0.917431	NaN	Y	NaN
3	BP-RCT-001	SITE01-001	1	SITE01	GBR	70	M	WHITE	ACTIVE	1	...	4	DIABP	Diastolic Blood Pressure (mmHg)	104	109	-5.0	-4.587156	NaN	Y	NaN
4	BP-RCT-001	SITE01-001	1	SITE01	GBR	70	M	WHITE	ACTIVE	1	...	5	DIABP	Diastolic Blood Pressure (mmHg)	102	109	-7.0	-6.422018	NaN	Y	NaN