# Data Loading Section
## Import Required Libraries
import os
import sys
import pandas as pd
import numpy as np
from pymongo import MongoClient
from pymongo.errors import PyMongoError
# ML/AI Libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
# Set up plotting style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
# Add project root to path for importing local modules
sys.path.append('../')
from data_engine.cosmos_driver import get_mongo_client, get_collection
# Load Card Data from Cosmos DB
def load_card_data():
    """
    Load all card data from Cosmos DB into a pandas DataFrame.
    """
    client = None
    try:
        # Connect to Cosmos DB
        client = get_mongo_client()
        # Get database and collection names from environment or defaults
        database_name = os.environ.get('COSMOS_DB_NAME', 'cards')
        container_name = os.environ.get('COSMOS_CONTAINER_NAME', 'mtgecorec')
        # Get collection
        collection = get_collection(client, container_name, database_name)
        # Fetch all documents
        cards = list(collection.find())
        # Convert to DataFrame
        df = pd.DataFrame(cards)
        # Remove MongoDB _id column if present
        if '_id' in df.columns:
            df = df.drop('_id', axis=1)
        print(f"Successfully loaded {len(df)} cards from Cosmos DB")
        print(f"DataFrame shape: {df.shape}")
        print(f"Columns: {list(df.columns)}")
        return df
    except Exception as e:
        print(f"Error loading data from Cosmos DB: {e}")
        return None
    finally:
        if client is not None:
            client.close()
# Load the data
df_cards = load_card_data()

# Display first few rows
if df_cards is not None:
    display(df_cards.head())
else:
    print("Failed to load card data. Please check your Cosmos DB connection and environment variables.")
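If the collection grows large, collection.find() pulls every field of every document into memory. A projection can trim the payload; a minimal sketch to drop into load_card_data (the field names here are assumptions, not the actual schema):

# Optional: fetch only the fields needed downstream instead of full documents.
# projection = {'name': 1, 'rarity': 1, 'colors': 1, 'price': 1, '_id': 0}
# cards = list(collection.find({}, projection))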
Data Exploration

In this section, we'll explore the loaded card data to understand its structure, distributions, and key characteristics.
# Basic Data Exploration
if df_cards is not None:
    print("DataFrame Info:")
    df_cards.info()  # info() prints directly and returns None, so no print() wrapper

    print("\n" + "=" * 50)
    print("Descriptive Statistics:")
    print(df_cards.describe())

    print("\n" + "=" * 50)
    print("Missing Values:")
    print(df_cards.isnull().sum())

    print("\n" + "=" * 50)
    print("Unique Values in Categorical Columns:")
    categorical_cols = df_cards.select_dtypes(include=['object']).columns
    for col in categorical_cols:
        try:
            unique_count = df_cards[col].nunique()
            print(f"{col}: {unique_count} unique values")
            if unique_count <= 10:
                print(f"  Values: {df_cards[col].unique()}")
        except TypeError:
            # Handle unhashable types like lists/dicts
            print(f"{col}: contains complex data types (lists/dicts), {len(df_cards)} total values")
else:
    print("No data available for exploration.")
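A quick visual pass complements the tables above. A minimal sketch, assuming the schema includes a categorical 'rarity' column and a numeric 'price' column (swap in whatever the real columns are):

# Visual exploration sketch; 'rarity' and 'price' are assumed column names.
if df_cards is not None:
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))
    if 'rarity' in df_cards.columns:
        df_cards['rarity'].value_counts().plot(kind='bar', ax=axes[0])
        axes[0].set_title('Card Count by Rarity')
    if 'price' in df_cards.columns:
        # Coerce to numeric in case prices were stored as strings.
        pd.to_numeric(df_cards['price'], errors='coerce').dropna().plot(kind='hist', bins=30, ax=axes[1])
        axes[1].set_title('Price Distribution')
    plt.tight_layout()
    plt.show()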
Data Preprocessing

Clean and prepare the data for analysis: handle missing values, encode categorical variables, and normalize numerical features.
# Example preprocessing steps (uncomment and modify as needed)

# Handle missing values
# df_cards = df_cards.dropna()  # or use an imputation strategy instead of dropping rows

# Encode categorical variables
# le = LabelEncoder()
# df_cards['rarity_encoded'] = le.fit_transform(df_cards['rarity'])

# Normalize numerical features
# scaler = StandardScaler()
# numerical_cols = ['price']  # add other numerical columns
# df_cards[numerical_cols] = scaler.fit_transform(df_cards[numerical_cols])
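For a reusable version of these steps, scikit-learn pipelines can bundle imputation, scaling, and encoding so the same transforms apply at training and prediction time. A minimal sketch, assuming 'price' is numeric and 'rarity' is categorical (both column names are assumptions):

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Column names are assumptions; substitute the real schema.
numeric_features = ['price']
categorical_features = ['rarity']

preprocess = ColumnTransformer(transformers=[
    # Impute missing numerics with the median, then standardize.
    ('num', Pipeline([('impute', SimpleImputer(strategy='median')),
                      ('scale', StandardScaler())]), numeric_features),
    # One-hot encode categoricals, ignoring labels unseen at fit time.
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
])
# X_processed = preprocess.fit_transform(df_cards[numeric_features + categorical_features])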
Feature Engineering

Create new features from the existing data that might be useful for ML models.
# Example feature engineering (uncomment and modify as needed)

# Extract features from text fields
# df_cards['name_length'] = df_cards['name'].str.len()
# df_cards['has_flavor_text'] = df_cards['flavor_text'].notna()

# Create color-based features
# df_cards['color_count'] = df_cards['colors'].apply(lambda x: len(x) if isinstance(x, list) else 0)
# df_cards['is_multicolored'] = df_cards['color_count'] > 1

# Date features
# df_cards['release_year'] = pd.to_datetime(df_cards['release_date']).dt.year
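List-valued columns such as colors can also be expanded into one indicator column per value. A sketch using scikit-learn's MultiLabelBinarizer (the 'colors' column name and its list-of-strings format are assumptions):

from sklearn.preprocessing import MultiLabelBinarizer

# Expand an assumed list-valued 'colors' column into per-color flag columns.
mlb = MultiLabelBinarizer()
color_matrix = mlb.fit_transform(
    df_cards['colors'].apply(lambda x: x if isinstance(x, list) else []))
color_flags = pd.DataFrame(color_matrix,
                           columns=[f'color_{c}' for c in mlb.classes_],
                           index=df_cards.index)
df_cards = pd.concat([df_cards, color_flags], axis=1)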
Model Development

Build and train ML models for tasks such as price prediction and rarity classification.
# Example model development (uncomment and modify as needed)

# Prepare features and target. The evaluation cell below uses classification
# metrics, so the target here is the encoded rarity class; for a continuous
# target such as price, use a regressor instead (see the sketch after this cell).
# features = ['color_count', 'name_length', 'release_year']  # add relevant features
# target = 'rarity_encoded'
# X = df_cards[features]
# y = df_cards[target]

# Split data
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
# model = RandomForestClassifier(n_estimators=100, random_state=42)
# model.fit(X_train, y_train)

# Make predictions
# y_pred = model.predict(X_test)
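For the price-prediction task mentioned above, the same workflow applies with a regressor and regression metrics. A sketch in the same uncomment-as-needed style (assumes X from the cell above and a numeric 'price' column):

# from sklearn.ensemble import RandomForestRegressor
# from sklearn.metrics import mean_absolute_error
# y_price = df_cards['price']
# Xp_train, Xp_test, yp_train, yp_test = train_test_split(X, y_price, test_size=0.2, random_state=42)
# reg = RandomForestRegressor(n_estimators=100, random_state=42)
# reg.fit(Xp_train, yp_train)
# print(f"MAE: {mean_absolute_error(yp_test, reg.predict(Xp_test)):.2f}")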
Model Evaluation

Evaluate model performance using appropriate metrics.
# Example evaluation (uncomment and modify as needed)

# Classification report
# print(classification_report(y_test, y_pred))

# Confusion matrix
# cm = confusion_matrix(y_test, y_pred)
# sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
# plt.title('Confusion Matrix')
# plt.show()

# Feature importance
# feature_importance = pd.DataFrame({
#     'feature': features,
#     'importance': model.feature_importances_
# }).sort_values('importance', ascending=False)
# plt.figure(figsize=(10, 6))
# sns.barplot(x='importance', y='feature', data=feature_importance)
# plt.title('Feature Importance')
# plt.show()
Results and Insights

Summarize findings, insights, and potential next steps:
- Key findings from data exploration
- Model performance summary
- Business insights
- Recommendations for future analysis
Next Steps
- Fine-tune model hyperparameters
- Try different algorithms (XGBoost, Neural Networks, etc.)
- Perform cross-validation (see the sketch below)
- Deploy model for predictions
- Set up monitoring and retraining pipeline
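A minimal cross-validation sketch for the classifier above, in the same uncomment-as-needed style (assumes model, X, and y from the Model Development cell have been defined):

# from sklearn.model_selection import cross_val_score
# scores = cross_val_score(model, X, y, cv=5)  # 5-fold cross-validation
# print(f"Accuracy per fold: {scores}")
# print(f"Mean accuracy: {scores.mean():.3f} (+/- {scores.std():.3f})")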
# Convert notebook to HTML for web display
import sys
sys.path.append('../scripts')
from convert_notebook import convert_notebook_to_html
# Run the conversion
convert_notebook_to_html()
Notebook converted to HTML and saved to static/card_explore.html