#!/usr/bin/env python3
"""
Script to check the shape of *.npy files in save_for_eval/AdvBench/alpaca_7B_jailbreak,
load them as benign vs adverse for comparison, and perform PCA analysis to identify
the most discriminative layers.
"""

import numpy as np
import os
import glob
from pathlib import Path
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
def check_npy_shapes(directory_path):
    """
    Check shapes of all .npy files in the specified directory
    and load them as benign vs adverse pairs.

    Parameters
    ----------
    directory_path : str
        Directory to scan for ``*.npy`` files. File names containing
        "benign"/"adverse" (case-insensitive) are classified accordingly.

    Returns
    -------
    tuple[dict, dict]
        ``(benign_data, adverse_data)``, each mapping file basename ->
        loaded numpy array. Both dicts are empty when no .npy files are
        found. (Bug fix: the original returned bare ``None`` on that
        path, which crashed callers that unpack the two-tuple.)
    """
    npy_files = glob.glob(os.path.join(directory_path, "*.npy"))

    if not npy_files:
        print(f"No .npy files found in {directory_path}")
        # Return an empty pair, not None, so `b, a = check_npy_shapes(...)`
        # in callers keeps working on an empty directory.
        return {}, {}

    print(f"Found {len(npy_files)} .npy files:")
    print("-" * 50)

    # Classify by file name; files with neither keyword are ignored.
    benign_files = [f for f in npy_files if "benign" in f.lower()]
    adverse_files = [f for f in npy_files if "adverse" in f.lower()]

    benign_data = {}
    adverse_data = {}

    # Load each group, reporting shape/dtype; a corrupt file is reported
    # and skipped rather than aborting the whole scan.
    print("BENIGN FILES:")
    for file_path in benign_files:
        try:
            data = np.load(file_path)
            file_name = os.path.basename(file_path)
            benign_data[file_name] = data
            print(f" {file_name}: shape {data.shape}, dtype {data.dtype}")
        except Exception as e:
            print(f" Error loading {file_path}: {e}")

    print("\nADVERSE FILES:")
    for file_path in adverse_files:
        try:
            data = np.load(file_path)
            file_name = os.path.basename(file_path)
            adverse_data[file_name] = data
            print(f" {file_name}: shape {data.shape}, dtype {data.dtype}")
        except Exception as e:
            print(f" Error loading {file_path}: {e}")

    print("\n" + "="*50)
    print("COMPARISON SUMMARY:")
    print("="*50)

    # Pair each benign file with its adverse counterpart by substituting
    # "benign" -> "adverse" in the file name.
    for benign_name in benign_data:
        adverse_name = benign_name.replace("benign", "adverse")
        if adverse_name in adverse_data:
            benign_array = benign_data[benign_name]
            adverse_array = adverse_data[adverse_name]

            print(f"\nComparing {benign_name} vs {adverse_name}:")
            print(f" Shapes: {benign_array.shape} vs {adverse_array.shape}")
            print(f" Dtypes: {benign_array.dtype} vs {adverse_array.dtype}")

            if benign_array.shape == adverse_array.shape:
                print(f" ✓ Shapes match")

                # Elementwise difference statistics only make sense when
                # the shapes agree.
                diff = np.abs(benign_array - adverse_array)
                print(f" Mean absolute difference: {np.mean(diff):.6f}")
                print(f" Max absolute difference: {np.max(diff):.6f}")
                print(f" Min absolute difference: {np.min(diff):.6f}")
            else:
                print(f" ✗ Shapes don't match!")

            print("-" * 30)

    return benign_data, adverse_data
def reshape_for_layer_analysis(data_array, n_samples=234, n_layers=15):
    """
    Reshape stacked activations into an explicit (sample, layer) layout.

    The input's first dimension is expected to be ``n_samples * n_layers``
    (3510 with the defaults: 234 samples x 15 layers).

    Parameters
    ----------
    data_array : np.ndarray
        Either 2-D ``(n_samples * n_layers, dim)`` or 3-D
        ``(n_samples * n_layers, comps, dim)`` (e.g. comps=32 for
        head/mlp-wise, 33 for layer-wise activations).
    n_samples : int, default 234
        Number of prompts/samples.
    n_layers : int, default 15
        Number of layers stacked per sample.

    Returns
    -------
    np.ndarray
        ``(n_samples, n_layers, ...)`` view of the same data.

    Raises
    ------
    ValueError
        If the first dimension does not equal ``n_samples * n_layers``,
        or the array is not 2-D or 3-D.
    """
    # Generalization: validate against the actual n_samples * n_layers
    # product instead of the hard-coded literal 3510, so non-default
    # sample/layer counts work (defaults still give 3510).
    expected = n_samples * n_layers
    if data_array.shape[0] != expected:
        raise ValueError(
            f"Expected first dimension to be {expected}, got {data_array.shape[0]}"
        )

    if data_array.ndim == 3:
        # e.g. head_wise/mlp_wise (3510, 32, 4096) -> (234, 15, 32, 4096)
        #      layer_wise         (3510, 33, 4096) -> (234, 15, 33, 4096)
        reshaped = data_array.reshape(
            n_samples, n_layers, data_array.shape[1], data_array.shape[2]
        )
    elif data_array.ndim == 2:
        # Simpler case: (3510, 4096) -> (234, 15, 4096)
        reshaped = data_array.reshape(n_samples, n_layers, data_array.shape[1])
    else:
        raise ValueError(f"Unsupported array shape: {data_array.shape}")

    return reshaped
def perform_pca_analysis(benign_data, adverse_data):
    """
    Perform PCA analysis to identify the most discriminative components
    (heads or layers) between benign and adverse data.

    For each benign/adverse file pair, fits a 2-component PCA per
    component index and ranks components by the variance explained by
    PC1. Prints a report and saves plots via
    create_component_visualizations into ./visualizations.

    Parameters:
        benign_data: dict mapping file name -> np.ndarray of activations.
            Assumed shape (samples, components, 4096) — TODO confirm
            against the arrays saved upstream.
        adverse_data: dict mapping file name -> np.ndarray, matching the
            benign arrays (paired by replacing "benign" with "adverse"
            in the file name).

    Returns:
        None (output is printed and written to disk).
    """
    print("\n" + "="*60)
    print("PCA ANALYSIS - IDENTIFYING DISCRIMINATIVE COMPONENTS")
    print("="*60)

    # Create output directory for visualizations (idempotent).
    os.makedirs("visualizations", exist_ok=True)

    for benign_name in benign_data:
        # Pair files by name: "benign" -> "adverse" substitution.
        adverse_name = benign_name.replace("benign", "adverse")
        if adverse_name in adverse_data:
            benign_array = benign_data[benign_name]
            adverse_array = adverse_data[adverse_name]

            print(f"\nAnalyzing {benign_name}:")
            print(f" Original shape: {benign_array.shape}")

            try:
                # Get the number of components (32 for heads/mlp, 33 for layers)
                n_components = benign_array.shape[1]
                # Label inferred from the file name; "mlp" is the fallback.
                component_type = "head" if "head" in benign_name else ("layer" if "layer" in benign_name else "mlp")

                print(f" Analyzing {n_components} {component_type}s")

                # Prepare data for PCA: analyze each component separately
                component_discrimination_scores = []
                all_pca_results = []

                for comp_idx in range(n_components):
                    # Extract data for this specific component across all samples
                    benign_comp_data = benign_array[:, comp_idx, :].reshape(-1, 4096)
                    adverse_comp_data = adverse_array[:, comp_idx, :].reshape(-1, 4096)

                    # Combine benign and adverse data for this component;
                    # labels: 0 = benign, 1 = adverse.
                    X = np.vstack([benign_comp_data, adverse_comp_data])
                    y = np.hstack([np.zeros(len(benign_comp_data)), np.ones(len(adverse_comp_data))])

                    # Standardize the data (zero mean, unit variance per feature)
                    scaler = StandardScaler()
                    X_scaled = scaler.fit_transform(X)

                    # Perform PCA down to 2 components for scoring/plotting
                    pca = PCA(n_components=2)
                    X_pca = pca.fit_transform(X_scaled)

                    # Calculate separation score (variance explained by PC1).
                    # NOTE(review): this is an unsupervised proxy — it does not
                    # use the labels, so a high score means "dominant variance
                    # direction", not necessarily class separation.
                    separation_score = pca.explained_variance_ratio_[0]
                    component_discrimination_scores.append(separation_score)

                    # Store PCA results for visualization (1-based component id)
                    all_pca_results.append({
                        'component': comp_idx + 1,
                        'X_pca': X_pca,
                        'y': y,
                        'pca': pca,
                        'score': separation_score
                    })

                    print(f" {component_type.capitalize()} {comp_idx + 1}: PCA separation score = {separation_score:.4f}")

                # Create visualizations (saved under ./visualizations)
                create_component_visualizations(all_pca_results, component_discrimination_scores, component_type)

                # Identify the most discriminative components (descending score)
                sorted_components = np.argsort(component_discrimination_scores)[::-1]
                print(f"\n Most discriminative {component_type}s (highest to lowest):")
                for i, comp_idx in enumerate(sorted_components):
                    print(f" {component_type.capitalize()} {comp_idx + 1}: {component_discrimination_scores[comp_idx]:.4f}")

                # Analyze the top discriminative component in more detail
                top_comp_idx = sorted_components[0]
                print(f"\n Detailed analysis of top {component_type} {top_comp_idx + 1}:")

                # Extract data for top component and redo the scale+PCA fit
                benign_top_comp = benign_array[:, top_comp_idx, :].reshape(-1, 4096)
                adverse_top_comp = adverse_array[:, top_comp_idx, :].reshape(-1, 4096)

                X_top = np.vstack([benign_top_comp, adverse_top_comp])
                y_top = np.hstack([np.zeros(len(benign_top_comp)), np.ones(len(adverse_top_comp))])

                scaler_top = StandardScaler()
                X_top_scaled = scaler_top.fit_transform(X_top)

                pca_top = PCA(n_components=2)
                X_top_pca = pca_top.fit_transform(X_top_scaled)

                # Calculate class separation in PCA space: Euclidean distance
                # between the benign and adverse centroids.
                benign_pca = X_top_pca[y_top == 0]
                adverse_pca = X_top_pca[y_top == 1]

                centroid_benign = np.mean(benign_pca, axis=0)
                centroid_adverse = np.mean(adverse_pca, axis=0)
                distance = np.linalg.norm(centroid_benign - centroid_adverse)

                print(f" Distance between class centroids: {distance:.4f}")
                print(f" Variance explained: PC1={pca_top.explained_variance_ratio_[0]:.4f}, PC2={pca_top.explained_variance_ratio_[1]:.4f}")
                print(f" Total samples analyzed: {len(X_top)}")

            except Exception as e:
                # Best-effort per file: report and continue with the next pair.
                print(f" Error analyzing data: {e}")
                continue
def create_component_visualizations(pca_results, discrimination_scores, component_type):
    """
    Create visualizations for PCA analysis results.

    Saves three PNGs under ./visualizations (directory assumed to exist,
    created by the caller):
      1. {component_type}_scores.png — bar chart of per-component scores,
         top 5 highlighted.
      2. {component_type}_pca_scatter.png — PC1/PC2 scatter of benign vs
         adverse points for the top 3 components.
      3. {component_type}_cumulative_scores.png — cumulative sum of the
         sorted scores.

    Parameters:
        pca_results: list of dicts with keys 'component' (1-based int),
            'X_pca' (n, 2 array), 'y' (labels, 0=benign/1=adverse),
            'pca', 'score' — as built by perform_pca_analysis.
        discrimination_scores: list of floats, one per component, in
            component order (parallel to pca_results).
        component_type: str, "head", "layer", or "mlp"; used in labels
            and output file names.

    Returns:
        None (figures are written to disk and closed).
    """
    # Plot 1: Component discrimination scores
    plt.figure(figsize=(15, 6))
    components = [result['component'] for result in pca_results]
    scores = discrimination_scores

    plt.bar(components, scores, alpha=0.7, color='skyblue')
    plt.xlabel(f'{component_type.capitalize()} Number')
    plt.ylabel('PCA Separation Score')
    plt.title(f'{component_type.capitalize()} Discrimination Scores')
    plt.xticks(components)
    plt.grid(True, alpha=0.3)

    # Highlight top 5 components by re-drawing their bars in color
    # (argsort ascending, take the last 5, reversed -> best first).
    top_indices = np.argsort(scores)[-5:][::-1]
    colors = ['red', 'orange', 'green', 'purple', 'brown']
    for i, idx in enumerate(top_indices):
        plt.bar(components[idx], scores[idx], color=colors[i], alpha=0.8,
                label=f'{component_type.capitalize()} {components[idx]}: {scores[idx]:.3f}')

    plt.legend()
    plt.tight_layout()
    plt.savefig(f'visualizations/{component_type}_scores.png', dpi=300, bbox_inches='tight')
    plt.close()

    # Plot 2: PCA scatter plots for top 3 components
    # NOTE(review): assumes at least 3 components exist — confirm callers
    # never pass fewer.
    fig, axes = plt.subplots(1, 3, figsize=(18, 6))
    top_indices = np.argsort(scores)[-3:][::-1]

    for i, idx in enumerate(top_indices):
        result = pca_results[idx]
        X_pca = result['X_pca']
        y = result['y']

        # Plot benign vs adverse in PCA space (label 0 = benign, 1 = adverse)
        benign_mask = y == 0
        adverse_mask = y == 1

        axes[i].scatter(X_pca[benign_mask, 0], X_pca[benign_mask, 1],
                        alpha=0.6, label='Benign', s=10, color='blue')
        axes[i].scatter(X_pca[adverse_mask, 0], X_pca[adverse_mask, 1],
                        alpha=0.6, label='Adverse', s=10, color='red')

        axes[i].set_xlabel('PC1')
        axes[i].set_ylabel('PC2')
        axes[i].set_title(f'{component_type.capitalize()} {result["component"]} (Score: {result["score"]:.3f})')
        axes[i].legend()
        axes[i].grid(True, alpha=0.3)

    plt.suptitle(f'PCA Visualization - Top 3 Discriminative {component_type.capitalize()}s')
    plt.tight_layout()
    plt.savefig(f'visualizations/{component_type}_pca_scatter.png', dpi=300, bbox_inches='tight')
    plt.close()

    # Plot 3: Cumulative variance explained (scores sorted descending,
    # then cumulatively summed)
    plt.figure(figsize=(12, 6))
    cumulative_scores = np.cumsum(np.sort(scores)[::-1])
    plt.plot(range(1, len(scores) + 1), cumulative_scores, 'o-', linewidth=2, markersize=8)
    plt.xlabel(f'Number of {component_type.capitalize()}s')
    plt.ylabel('Cumulative Discrimination Score')
    plt.title(f'Cumulative Discrimination - {component_type.capitalize()}s')
    plt.grid(True, alpha=0.3)
    plt.xticks(range(1, len(scores) + 1))
    plt.tight_layout()
    plt.savefig(f'visualizations/{component_type}_cumulative_scores.png', dpi=300, bbox_inches='tight')
    plt.close()

    print(f" Visualizations saved to visualizations/{component_type}_*.png")
def main():
    """
    Main function to run the script.

    Scans the hard-coded target directory for .npy files, prints a
    benign-vs-adverse shape comparison, and runs the PCA analysis when
    both groups were loaded.

    Returns
    -------
    tuple[dict, dict] | None
        (benign_data, adverse_data) for further interactive use, or None
        when the directory is missing or contains no .npy files.
    """
    target_directory = "save_for_eval/AdvBench/alpaca_7B_jailbreak"

    print(f"Checking .npy files in: {target_directory}")
    print("=" * 60)

    # Check if directory exists before scanning.
    if not os.path.exists(target_directory):
        print(f"Error: Directory '{target_directory}' does not exist!")
        return None

    # Bug fix: check_npy_shapes may return None (no .npy files found);
    # guard before unpacking, otherwise this line raised TypeError.
    result = check_npy_shapes(target_directory)
    if not result:
        return None
    benign_data, adverse_data = result

    # Perform PCA analysis to identify discriminative layers, but only
    # when both groups actually loaded something.
    if benign_data and adverse_data:
        perform_pca_analysis(benign_data, adverse_data)

    print("\nScript completed successfully!")

    # Return the loaded data for further analysis if needed.
    return benign_data, adverse_data
# Run the analysis only when executed as a script, not when imported.
if __name__ == "__main__":
    main()