SHIKARICHACHA
/

bank_statement_analysis

Model card Files Files and versions

xet

Community

SHIKARICHACHA committed on Jul 13

Commit

0474efa

verified ·

1 Parent(s): c68a3b2

Update app.py

Browse files

Files changed (1) hide show

app.py +335 -112

app.py CHANGED Viewed

@@ -4,12 +4,18 @@ import re
 import pandas as pd
 import plotly.express as px
 import plotly.graph_objects as go
-from plotly.subplots import make_subplots
 import numpy as np
 from datetime import datetime, timedelta
 import sqlite3
 import hashlib
-import json
 from http.server import HTTPServer, SimpleHTTPRequestHandler
 # Advanced ML and NLP imports
@@ -26,7 +32,7 @@ except ImportError as e:
 # Configuration
 st.set_page_config(
-    page_title='Enhanced Bank Statement Analysis',
     page_icon='🏦',
     layout='wide',
     initial_sidebar_state='expanded'
@@ -39,10 +45,11 @@ if 'transactions_df' not in st.session_state:
     st.session_state.transactions_df = None
 if 'analysis_complete' not in st.session_state:
     st.session_state.analysis_complete = False
 class DatabaseManager:
     """Handles all database operations for user profiles and financial data"""
     def __init__(self):
         self.db_path = 'financial_analysis.db'
         self.init_database()
@@ -89,6 +96,7 @@ class DatabaseManager:
             CREATE TABLE IF NOT EXISTS financial_data (
                 id INTEGER PRIMARY KEY AUTOINCREMENT,
                 user_id TEXT NOT NULL,
                 date TEXT NOT NULL,
                 description TEXT,
                 amount REAL,
@@ -104,6 +112,7 @@ class DatabaseManager:
             CREATE TABLE IF NOT EXISTS recommendations (
                 id INTEGER PRIMARY KEY AUTOINCREMENT,
                 user_id TEXT NOT NULL,
                 recommendation_type TEXT,
                 title TEXT,
                 description TEXT,
@@ -158,13 +167,13 @@ class DatabaseManager:
         conn.close()
         if result:
-            columns = ['id', 'user_id', 'name', 'email', 'password_hash', 'financial_goals', 'risk_tolerance', 'monthly_income', 'created_at']
             return dict(zip(columns, result))
         return None
 class PersonalizationEngine:
     """Advanced personalization and recommendation system"""
     def __init__(self, user_profile=None):
         self.user_profile = user_profile
         self.scaler = StandardScaler()
@@ -293,7 +302,6 @@ class PersonalizationEngine:
 class AdvancedAnalytics:
     """Advanced analytics and ML models for financial analysis"""
     def __init__(self):
         self.models = {}
         self.scaler = StandardScaler()
@@ -347,85 +355,153 @@ class AdvancedAnalytics:
             'Predicted_Amount': future_predictions
         })
-    def enhanced_loan_prediction(self, df):
-        """Enhanced loan eligibility prediction using XGBoost"""
         try:
             # Load training data
-            training_data = pd.read_csv('absa.csv')
-            # Prepare features
             total_credits = df[df['Amount'] > 0]['Amount'].sum()
             total_debits = abs(df[df['Amount'] < 0]['Amount'].sum())
             num_transactions = len(df)
-            # Advanced features
-            avg_transaction_amount = df['Amount'].mean()
-            transaction_variability = df['Amount'].std()
-            balance_trend = df['Balance'].iloc[-1] - df['Balance'].iloc[0] if len(df) > 1 else 0
-            # Additional features
-            credit_frequency = len(df[df['Amount'] > 0]) / max(1, len(df))
-            max_single_debit = abs(df[df['Amount'] < 0]['Amount'].min()) if len(df[df['Amount'] < 0]) > 0 else 0
-            balance_volatility = df['Balance'].std()
             # Prepare feature vector
             features = pd.DataFrame({
-                'total_credits': [total_credits],
-                'total_debits': [total_debits],
-                'num_transactions': [num_transactions],
-                'avg_transaction_amount': [avg_transaction_amount],
-                'transaction_variability': [transaction_variability],
-                'balance_trend': [balance_trend],
-                'credit_frequency': [credit_frequency],
-                'max_single_debit': [max_single_debit],
-                'balance_volatility': [balance_volatility]
             })
-            # Train XGBoost model if available
-            if 'xgb' in globals():
-                X_train = training_data[['total_credits', 'total_debits', 'num_transactions',
-                                       'avg_transaction_amount', 'transaction_variability', 'balance_trend']]
-                y_train = training_data['Eligibility (y)']
-                # Add missing features with defaults
-                for col in features.columns:
-                    if col not in X_train.columns:
-                        X_train[col] = 0
-                model = xgb.XGBClassifier(random_state=42)
-                model.fit(X_train[features.columns], y_train)
-                prediction = model.predict(features)[0]
-                prediction_proba = model.predict_proba(features)[0]
-                return {
-                    'eligible': bool(prediction),
-                    'confidence': float(max(prediction_proba)),
-                    'model_type': 'XGBoost'
-                }
-            else:
-                # Fallback to Random Forest
-                model = RandomForestClassifier(n_estimators=100, random_state=42)
-                X_train = training_data[['total_credits', 'total_debits', 'num_transactions',
-                                       'avg_transaction_amount', 'transaction_variability', 'balance_trend']]
-                y_train = training_data['Eligibility (y)']
-                model.fit(X_train, y_train)
-                features_basic = features[['total_credits', 'total_debits', 'num_transactions',
-                                         'avg_transaction_amount', 'transaction_variability', 'balance_trend']]
-                prediction = model.predict(features_basic)[0]
-                prediction_proba = model.predict_proba(features_basic)[0]
-                return {
-                    'eligible': bool(prediction),
-                    'confidence': float(max(prediction_proba)),
-                    'model_type': 'Random Forest'
-                }
         except Exception as e:
             st.error(f"Error in loan prediction: {str(e)}")
             return {'eligible': False, 'confidence': 0.0, 'model_type': 'Error'}
 def parse_pdf_enhanced(file):
     """Enhanced PDF parsing with better text extraction"""
     try:
@@ -434,36 +510,132 @@ def parse_pdf_enhanced(file):
             for page in pdf.pages:
                 page_text = page.extract_text()
                 if page_text:
-                    text += page_text
         return text
     except Exception as e:
         st.error(f"Error parsing PDF: {str(e)}")
         return ""
-def process_text_to_df_enhanced(text):
-    """Enhanced text processing with better pattern recognition"""
     transactions = []
-    # Use the exact working pattern from app_original.py as primary
-    transaction_pattern = re.compile(r'(\d{4}-\d{2}-\d{2})\s+(.+?)\s+(-?R?\d{1,3}(?:,\d{3})*(?:\.\d{2})?)\s+(-?R?\d{1,3}(?:,\d{3})*(?:\.\d{2})?)')
-    for line in text.split('\n'):
-        line = line.strip()
-        if not line:
-            continue
-        match = transaction_pattern.search(line)
-        if match:
-            try:
-                date_str, description, amount_str, balance_str = match.groups()
-                # Clean and convert amounts (same logic as app_original.py)
-                amount = float(amount_str.replace(',', '').replace('R', '').replace(' ', ''))
-                balance = float(balance_str.replace(',', '').replace('R', '').replace(' ', ''))
-                transactions.append([date_str, description.strip(), amount, balance])
-            except (ValueError, AttributeError):
-                continue
     if not transactions:
         return pd.DataFrame(columns=['Date', 'Description', 'Amount', 'Balance'])
@@ -471,15 +643,16 @@ def process_text_to_df_enhanced(text):
     df = pd.DataFrame(transactions, columns=['Date', 'Description', 'Amount', 'Balance'])
     df['Date'] = pd.to_datetime(df['Date'])
     df = df.sort_values('Date').reset_index(drop=True)
     return df
-def categorize_expense_enhanced(description):
-    """Enhanced expense categorization using NLP"""
     description_lower = description.lower()
-    # Enhanced categorization with more sophisticated rules
-    category_keywords = {
         'Salary/Income': ['salary', 'wage', 'income', 'payroll', 'refund'],
         'Groceries': ['grocery', 'supermarket', 'food', 'spar', 'checkers', 'woolworths'],
         'Transport': ['fuel', 'petrol', 'uber', 'taxi', 'transport', 'car payment'],
@@ -489,16 +662,36 @@ def categorize_expense_enhanced(description):
         'Shopping': ['retail', 'clothing', 'amazon', 'takealot', 'mall'],
         'Investments': ['investment', 'shares', 'unit trust', 'retirement'],
         'Insurance': ['insurance', 'medical aid', 'life cover', 'short term'],
-        'POS Purchases': ['cashsend mobile', 'pos purchase'],
-        'Payments': ['immediate trf', 'digital payment', 'payment'],
-        'Credits': ['acb credit', 'immediate trf cr', 'credit'],
-        'Bank Charges': ['fees', 'charge', 'commission'],
-        'Cash Transactions': ['atm', 'cash deposit', 'withdrawal'],
         'Cellular': ['airtime', 'data', 'vodacom', 'mtn', 'cell c'],
         'Interest': ['interest'],
         'Failed Transactions': ['unsuccessful', 'declined', 'failed']
     }
     # Check for specific keywords
     for category, keywords in category_keywords.items():
         if any(keyword in description_lower for keyword in keywords):
@@ -513,11 +706,10 @@ def categorize_expense_enhanced(description):
     except:
         pass
-    return 'Others'
 def create_advanced_visualizations(df, patterns, recommendations):
     """Create advanced interactive visualizations"""
     # 1. Financial Health Dashboard
     col1, col2, col3, col4 = st.columns(4)
@@ -589,13 +781,12 @@ def create_advanced_visualizations(df, patterns, recommendations):
 def main():
     """Main application function"""
     # Initialize database
     db = DatabaseManager()
     # Sidebar for user authentication
     with st.sidebar:
-        st.title("🏦 Financial Analysis")
         if st.session_state.user_profile is None:
             tab1, tab2 = st.tabs(["Login", "Sign Up"])
@@ -641,14 +832,22 @@ def main():
                 st.session_state.user_profile = None
                 st.session_state.transactions_df = None
                 st.session_state.analysis_complete = False
                 st.experimental_rerun()
     # Main content area
     if st.session_state.user_profile is None:
         st.markdown("""
-        # 🏦 Enhanced Bank Statement Analysis
-        ### Welcome to the next generation of financial analysis!
         **Key Features:**
         - 🤖 **AI-Powered Insights**: Advanced machine learning for personalized recommendations
@@ -657,29 +856,42 @@ def main():
         - 🔮 **Predictive Analysis**: Forecast future spending trends
         - 🛡️ **Anomaly Detection**: Identify unusual transactions
         - 💡 **Smart Recommendations**: Personalized financial advice
         **Please login or create an account to get started.**
         """)
         return
     # Main analysis interface
-    st.title(f"🏦 Financial Analysis Dashboard - {st.session_state.user_profile['name']}")
     # File upload section
     st.markdown("### 📄 Upload Your Bank Statement")
     uploaded_file = st.file_uploader(
         "Choose a PDF bank statement",
         type="pdf",
-        help="Upload your ABSA bank statement in PDF format for analysis"
     )
     if uploaded_file is not None:
         try:
-            # Parse PDF
-            with st.spinner("🔍 Parsing bank statement..."):
-                text = parse_pdf_enhanced(uploaded_file)
-                df = process_text_to_df_enhanced(text)
             if df.empty:
                 st.warning("⚠️ No transactions found in the uploaded statement. Please check the file format.")
@@ -689,7 +901,7 @@ def main():
             st.session_state.transactions_df = df
             # Enhance data with categories
-            df['Category'] = df['Description'].apply(categorize_expense_enhanced)
             # Initialize analytics engines
             personalization = PersonalizationEngine(st.session_state.user_profile)
@@ -700,11 +912,22 @@ def main():
                 patterns = personalization.analyze_spending_patterns(df)
                 recommendations = personalization.generate_personalized_recommendations(df, patterns)
                 health_score, score_components = personalization.calculate_financial_health_score(df, patterns)
-                loan_prediction = analytics.enhanced_loan_prediction(df)
                 df_with_anomalies = analytics.detect_anomalies(df)
             st.session_state.analysis_complete = True
             # Display results in tabs
             tab1, tab2, tab3, tab4, tab5, tab6 = st.tabs([
                 "📊 Overview", "💡 Recommendations", "🏥 Health Score",
@@ -902,7 +1125,7 @@ def main():
         except Exception as e:
             st.error(f"❌ An error occurred while processing your statement: {str(e)}")
-            st.info("Please ensure your PDF is a valid ABSA bank statement and try again.")
 if __name__ == "__main__":
     main()
@@ -944,4 +1167,4 @@ st.markdown("""
         color: white;
     }
 </style>
-""", unsafe_allow_html=True)

 import pandas as pd
 import plotly.express as px
 import plotly.graph_objects as go
 import numpy as np
 from datetime import datetime, timedelta
 import sqlite3
 import hashlib
+import tempfile
+import PyPDF2
+import os
+import webbrowser
+import threading
+import uuid
+import subprocess
+import time
 from http.server import HTTPServer, SimpleHTTPRequestHandler
 # Advanced ML and NLP imports
 # Configuration
 st.set_page_config(
+    page_title='Universal Bank Statement Analyzer',
     page_icon='🏦',
     layout='wide',
     initial_sidebar_state='expanded'
     st.session_state.transactions_df = None
 if 'analysis_complete' not in st.session_state:
     st.session_state.analysis_complete = False
+if 'detected_bank' not in st.session_state:
+    st.session_state.detected_bank = None
 class DatabaseManager:
     """Handles all database operations for user profiles and financial data"""
     def __init__(self):
         self.db_path = 'financial_analysis.db'
         self.init_database()
             CREATE TABLE IF NOT EXISTS financial_data (
                 id INTEGER PRIMARY KEY AUTOINCREMENT,
                 user_id TEXT NOT NULL,
+                bank_name TEXT,
                 date TEXT NOT NULL,
                 description TEXT,
                 amount REAL,
             CREATE TABLE IF NOT EXISTS recommendations (
                 id INTEGER PRIMARY KEY AUTOINCREMENT,
                 user_id TEXT NOT NULL,
+                bank_name TEXT,
                 recommendation_type TEXT,
                 title TEXT,
                 description TEXT,
         conn.close()
         if result:
+            columns = ['id', 'user_id', 'name', 'email', 'password_hash', 'financial_goals',
+                      'risk_tolerance', 'monthly_income', 'created_at']
             return dict(zip(columns, result))
         return None
 class PersonalizationEngine:
     """Advanced personalization and recommendation system"""
     def __init__(self, user_profile=None):
         self.user_profile = user_profile
         self.scaler = StandardScaler()
 class AdvancedAnalytics:
     """Advanced analytics and ML models for financial analysis"""
     def __init__(self):
         self.models = {}
         self.scaler = StandardScaler()
             'Predicted_Amount': future_predictions
         })
+    def enhanced_loan_prediction(self, df, bank_name):
+        """Enhanced loan eligibility prediction"""
         try:
             # Load training data
+            training_data = pd.read_csv(f'{bank_name.lower().replace(" ", "_")}.csv')
+            # Prepare features from the transaction data
             total_credits = df[df['Amount'] > 0]['Amount'].sum()
             total_debits = abs(df[df['Amount'] < 0]['Amount'].sum())
             num_transactions = len(df)
+            avg_balance = df['Balance'].mean()
+            closing_balance = df['Balance'].iloc[-1] if len(df) > 0 else 0
             # Prepare feature vector
             features = pd.DataFrame({
+                'Total_Credits': [total_credits],
+                'Total_Debits': [total_debits],
+                'Average_Balance': [avg_balance],
+                'Num_Transactions': [num_transactions],
+                'Closing_Balance': [closing_balance]
             })
+            # Ensure we only use columns that exist in training data
+            available_columns = [col for col in features.columns if col in training_data.columns]
+            features = features[available_columns]
+            # Train model
+            model = RandomForestClassifier(n_estimators=100, random_state=42)
+            X_train = training_data[available_columns]
+            y_train = training_data['Eligibility']
+            model.fit(X_train, y_train)
+            prediction = model.predict(features)[0]
+            prediction_proba = model.predict_proba(features)[0]
+            return {
+                'eligible': bool(prediction),
+                'confidence': float(max(prediction_proba)),
+                'model_type': 'Random Forest'
+            }
         except Exception as e:
             st.error(f"Error in loan prediction: {str(e)}")
             return {'eligible': False, 'confidence': 0.0, 'model_type': 'Error'}
def _collect_pdf_page_text(stream):
    """Return the text of every page in the PDF ``stream``, one page per chunk."""
    reader = PyPDF2.PdfReader(stream)
    chunks = []
    for page in reader.pages:
        # Guard against pages that yield no text (e.g. scanned images).
        page_text = page.extract_text()
        if page_text:
            # Trailing newline keeps the last line of one page from fusing
            # with the first line of the next; the transaction parsers work
            # line by line.
            chunks.append(page_text + '\n')
    return ''.join(chunks)


def extract_text_from_pdf(file):
    """Extract text from PDF file.

    Args:
        file: Either a filesystem path (``str``) or an already opened binary
            file-like object.

    Returns:
        The concatenated page text, each page terminated by a newline.
        If anything goes wrong the error is reported through ``st.error``
        and an empty string is returned.
    """
    try:
        if isinstance(file, str):
            with open(file, 'rb') as handle:
                return _collect_pdf_page_text(handle)
        return _collect_pdf_page_text(file)
    except Exception as e:
        st.error(f"Error extracting text from PDF: {str(e)}")
        return ""
def identify_bank_from_text(text):
    """Identify bank from statement text.

    The match is case-insensitive. When several banks are mentioned, the one
    whose keyword appears earliest in the text wins, so a bank named in a
    later transaction description does not override the bank named first.
    NOTE(review): this assumes the issuing bank's name comes first in the
    extracted text (header/letterhead) - confirm against real statements.

    Args:
        text: Raw statement text; ``None`` or empty text yields ``None``.

    Returns:
        One of ``'FNB'``, ``'Standard Bank'``, ``'Nedbank'``, ``'ABSA'``,
        ``'Capitec Bank'``, or ``None`` when no keyword is found.
    """
    if not text:
        return None

    text_lower = text.lower()
    bank_keywords = {
        'FNB': ['fnb', 'first national bank'],
        'Standard Bank': ['standard bank'],
        'Nedbank': ['nedbank'],
        'ABSA': ['absa'],
        'Capitec Bank': ['capitec']
    }

    best_bank = None
    best_position = None
    for bank, keywords in bank_keywords.items():
        hits = [pos for pos in (text_lower.find(keyword) for keyword in keywords) if pos != -1]
        if not hits:
            continue
        first_hit = min(hits)
        # Strict comparison keeps the earlier dict entry on a tie.
        if best_position is None or first_hit < best_position:
            best_bank, best_position = bank, first_hit
    return best_bank
# Amount with optional sign / rand prefix. Accepts both "1,234.56" and
# "1234.56"; the bare \d{1,3} form used before truncated "1234.56" to "123".
_METADATA_AMOUNT = r"-?R?(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d{2})?"


def _metadata_field(pattern, text, default):
    """Return the stripped first capture group of ``pattern`` or ``default``."""
    found = re.search(pattern, text)
    return found.group(1).strip() if found else default


def _metadata_balance(pattern, text):
    """Return the amount captured by ``pattern`` as a float, 0.0 if absent or malformed."""
    found = re.search(pattern, text)
    if not found:
        return 0.0
    try:
        return float(found.group(1).replace(',', '').replace('R', '').replace(' ', ''))
    except ValueError:
        return 0.0


def extract_bank_statement_metadata(text, bank_name):
    """Extract metadata from bank statement based on bank.

    Args:
        text: Raw statement text (``None`` is treated as empty).
        bank_name: Detected bank name.

    Returns:
        For ``'FNB'``: ``(account_holder_name, closing_balance)``.
        For every other bank:
        ``(account_holder_name, account_number, statement_period,
        closing_balance)``. The display code unpacks four values for all
        non-FNB banks, so the fallback branch returns the same shape as
        Standard Bank / Nedbank / ABSA.
        Missing fields fall back to ``"Name not found"``, ``"Not found"``,
        ``"Not specified"`` and ``0.0`` respectively.
    """
    text = text or ""

    if bank_name == 'FNB':
        # Title plus upper-case name, restricted to a single line so the match
        # cannot run on into the next upper-case line of the statement.
        name_match = re.search(r"\b(MR|MRS)[ \t]+([A-Z][A-Z \t]*)", text)
        account_holder_name = name_match.group(0).strip() if name_match else "Name not found"
        closing_balance = _metadata_balance(r"Closing Balance\s+(\d[\d,]*\.?\d*)", text)
        return account_holder_name, closing_balance

    # Standard Bank, Nedbank, ABSA and any other bank share the labelled layout.
    # "(?:\n|$)" also accepts a field that sits on the very last line.
    account_holder_name = _metadata_field(
        r"Account Holder:\s*(.+?)(?:\n|$)", text, "Name not found")
    account_number = _metadata_field(r"Account Number:\s*(\d+)", text, "Not found")
    statement_period = _metadata_field(
        r"Statement Period:\s*(.+?)(?:\n|$)", text, "Not specified")
    closing_balance = _metadata_balance(
        r"Closing Balance:\s*(" + _METADATA_AMOUNT + r")", text)
    return account_holder_name, account_number, statement_period, closing_balance
 def parse_pdf_enhanced(file):
     """Enhanced PDF parsing with better text extraction"""
     try:
             for page in pdf.pages:
                 page_text = page.extract_text()
                 if page_text:
+                    text += page_text + '\n'
         return text
     except Exception as e:
         st.error(f"Error parsing PDF: {str(e)}")
         return ""
_PLAIN_AMOUNT = r'-?\d{1,3}(?:,\d{3})*(?:\.\d{2})?'
_RAND_AMOUNT = r'-?R?\d{1,3}(?:,\d{3})*(?:\.\d{2})?'
_STATEMENT_HEADER_MARKERS = ('Transactions in RAND', 'Date Description')

# bank -> (line pattern, strptime format of the date column, header markers).
# A date format of None marks the FNB layout, whose dates carry no year and
# whose amounts use Cr/Dr suffixes.
_STATEMENT_FORMATS = {
    'FNB': (
        re.compile(
            r'(\d{2} \w{3})\s+'  # Date (e.g., "02 Apr")
            r'(.+?)\s+'  # Description
            r'(' + _PLAIN_AMOUNT + r')'  # Amount
            # Amount and balance must be separated by a Cr/Dr suffix or by
            # whitespace; otherwise a single number such as "250.00" could be
            # split into amount "25" and balance "0.00".
            r'(?:(Cr|Dr)\s*|\s+)'
            r'(' + _PLAIN_AMOUNT + r')(Cr|Dr)?'  # Balance
        ),
        None,
        _STATEMENT_HEADER_MARKERS,
    ),
    'Standard Bank': (
        re.compile(
            r'(\d{2} \w{3} \d{2})\s+'  # Date (e.g., "02 Apr 22")
            r'(.+?)\s+'  # Description
            r'(' + _PLAIN_AMOUNT + r')\s+'  # Amount
            r'(' + _PLAIN_AMOUNT + r')'  # Balance
        ),
        '%d %b %y',
        _STATEMENT_HEADER_MARKERS,
    ),
    'Nedbank': (
        re.compile(
            r'(\d{2}/\d{2}/\d{4})\s+'  # Date (day/month/year, e.g., "02/04/2022")
            r'(.+?)\s+'  # Description
            r'(' + _RAND_AMOUNT + r')\s+'  # Amount
            r'(' + _RAND_AMOUNT + r')'  # Balance
        ),
        '%d/%m/%Y',
        _STATEMENT_HEADER_MARKERS,
    ),
}

# Default pattern for other banks (ISO dates, no header lines to skip).
_DEFAULT_STATEMENT_FORMAT = (
    re.compile(
        r'(\d{4}-\d{2}-\d{2})\s+'  # Date
        r'(.+?)\s+'  # Description
        r'(' + _RAND_AMOUNT + r')\s+'  # Amount
        r'(' + _RAND_AMOUNT + r')'  # Balance
    ),
    '%Y-%m-%d',
    (),
)


def _statement_amount_to_float(raw):
    """Convert a statement amount such as ``-R1,234.56`` to a float."""
    return float(raw.replace(',', '').replace('R', '').replace(' ', ''))


def _fnb_transaction_row(match):
    """Build ``[date, description, amount, balance]`` from an FNB line match."""
    date_raw, description, amount_raw, amount_flag, balance_raw, balance_flag = match.groups()

    # FNB prints no year. Assume the current one, but a statement cannot hold
    # future transactions, so a date that would lie ahead of today belongs to
    # the previous year (e.g. a December statement analysed in January).
    today = datetime.now()
    when = datetime.strptime(f"{date_raw} {today.year}", "%d %b %Y")
    if when > today:
        when = when.replace(year=when.year - 1)

    # The rest of the app treats Amount > 0 as money in and Amount < 0 as
    # money out: "Cr" amounts are credits (positive), everything else a debit.
    magnitude = abs(_statement_amount_to_float(amount_raw))
    amount = magnitude if amount_flag == 'Cr' else -magnitude

    balance = _statement_amount_to_float(balance_raw)
    if balance_flag == 'Dr':
        balance = -abs(balance)  # "Dr" balance means the account is overdrawn

    return [when.strftime("%Y-%m-%d"), description.strip(), amount, balance]


def _dated_transaction_row(match, date_format):
    """Build ``[date, description, amount, balance]`` from a four-group match."""
    date_raw, description, amount_raw, balance_raw = match.groups()
    # Parse with the bank's explicit format so day/month order is never
    # guessed, then normalise to ISO for pd.to_datetime.
    when = datetime.strptime(date_raw, date_format)
    return [
        when.strftime('%Y-%m-%d'),
        description.strip(),
        _statement_amount_to_float(amount_raw),
        _statement_amount_to_float(balance_raw),
    ]


def process_text_to_df_enhanced(text, bank_name):
    """Process text to DataFrame based on bank format.

    Each line of ``text`` is matched against the transaction pattern of
    ``bank_name`` (FNB, Standard Bank, Nedbank; any other bank uses the
    ISO-date default pattern). Lines that do not match, or whose date or
    amounts cannot be converted, are skipped.

    Args:
        text: Raw statement text, one transaction per line (``None`` is
            treated as empty).
        bank_name: Detected bank name; selects the line format.

    Returns:
        DataFrame sorted by ``Date`` with columns ``Date`` (datetime64),
        ``Description``, ``Amount`` (positive = credit, negative = debit),
        ``Balance`` and ``Bank``. Transactions on the same date keep their
        statement order. When nothing could be parsed, an empty DataFrame
        with columns ``Date``, ``Description``, ``Amount``, ``Balance`` is
        returned.
    """
    columns = ['Date', 'Description', 'Amount', 'Balance']
    pattern, date_format, header_markers = _STATEMENT_FORMATS.get(
        bank_name, _DEFAULT_STATEMENT_FORMAT)

    transactions = []
    for line in (text or '').split('\n'):
        line = line.strip()
        if not line or any(marker in line for marker in header_markers):
            continue
        match = pattern.search(line)
        if not match:
            continue
        try:
            if date_format is None:
                row = _fnb_transaction_row(match)
            else:
                row = _dated_transaction_row(match, date_format)
        except (ValueError, AttributeError):
            continue
        transactions.append(row)

    if not transactions:
        return pd.DataFrame(columns=columns)

    df = pd.DataFrame(transactions, columns=columns)
    df['Date'] = pd.to_datetime(df['Date'])
    # Stable sort: same-day transactions stay in statement order, which
    # matters for anything reading the last Balance as the closing balance.
    df = df.sort_values('Date', kind='mergesort').reset_index(drop=True)
    df['Bank'] = bank_name  # Add bank name column
    return df
+def categorize_expense_enhanced(description, bank_name):
+    """Enhanced expense categorization with bank-specific rules"""
     description_lower = description.lower()
+    # Common categories across all banks
+    common_categories = {
         'Salary/Income': ['salary', 'wage', 'income', 'payroll', 'refund'],
         'Groceries': ['grocery', 'supermarket', 'food', 'spar', 'checkers', 'woolworths'],
         'Transport': ['fuel', 'petrol', 'uber', 'taxi', 'transport', 'car payment'],
         'Shopping': ['retail', 'clothing', 'amazon', 'takealot', 'mall'],
         'Investments': ['investment', 'shares', 'unit trust', 'retirement'],
         'Insurance': ['insurance', 'medical aid', 'life cover', 'short term'],
+        'Bank Charges': ['fees', 'charge', 'service', 'cost', 'monthly account fee'],
+        'Cash Transactions': ['atm', 'cash', 'withdrawal', 'deposit'],
         'Cellular': ['airtime', 'data', 'vodacom', 'mtn', 'cell c'],
         'Interest': ['interest'],
         'Failed Transactions': ['unsuccessful', 'declined', 'failed']
     }
+    # Bank-specific categories
+    bank_specific = {
+        'FNB': {
+            'POS Purchases': ['pos purchase', 'card purchase', 'debit order'],
+            'Payments': ['payment to', 'fnb app rtc pmt', 'internet pmt', 'debit order']
+        },
+        'Standard Bank': {
+            'Payments': ['payment', 'transfer', 'debit order', 'immediate trf', 'digital payment']
+        },
+        'Nedbank': {
+            'POS Purchases': ['pos purchase', 'card purchase', 'debit order', 'cashsend'],
+            'Payments': ['payment to', 'immediate trf', 'digital payment', 'pmt to'],
+            'Loans': ['loan payment', 'nedloan', 'personal loan']
+        },
+        'ABSA': {
+            'POS Purchases': ['cashsend mobile', 'pos purchase'],
+            'Payments': ['immediate trf', 'digital payment', 'payment']
+        }
+    }
+    # Combine common and bank-specific categories
+    category_keywords = {**common_categories, **(bank_specific.get(bank_name, {}))}
     # Check for specific keywords
     for category, keywords in category_keywords.items():
         if any(keyword in description_lower for keyword in keywords):
     except:
         pass
+    return 'Other'
 def create_advanced_visualizations(df, patterns, recommendations):
     """Create advanced interactive visualizations"""
     # 1. Financial Health Dashboard
     col1, col2, col3, col4 = st.columns(4)
 def main():
     """Main application function"""
     # Initialize database
     db = DatabaseManager()
     # Sidebar for user authentication
     with st.sidebar:
+        st.title("🏦 Universal Bank Analyzer")
         if st.session_state.user_profile is None:
             tab1, tab2 = st.tabs(["Login", "Sign Up"])
                 st.session_state.user_profile = None
                 st.session_state.transactions_df = None
                 st.session_state.analysis_complete = False
+                st.session_state.detected_bank = None
                 st.experimental_rerun()
     # Main content area
     if st.session_state.user_profile is None:
         st.markdown("""
+        # 🏦 Universal Bank Statement Analysis
+        ### Welcome to the next generation of financial analysis for all major banks!
+        **Supported Banks:**
+        - FNB (First National Bank)
+        - Standard Bank
+        - Nedbank
+        - ABSA
+        - Capitec Bank
         **Key Features:**
         - 🤖 **AI-Powered Insights**: Advanced machine learning for personalized recommendations
         - 🔮 **Predictive Analysis**: Forecast future spending trends
         - 🛡️ **Anomaly Detection**: Identify unusual transactions
         - 💡 **Smart Recommendations**: Personalized financial advice
+        - 💰 **Loan Eligibility**: Check your loan eligibility instantly
         **Please login or create an account to get started.**
         """)
         return
     # Main analysis interface
+    st.title(f"🏦 Universal Bank Analysis Dashboard - {st.session_state.user_profile['name']}")
     # File upload section
     st.markdown("### 📄 Upload Your Bank Statement")
     uploaded_file = st.file_uploader(
         "Choose a PDF bank statement",
         type="pdf",
+        help="Upload your bank statement in PDF format for analysis"
     )
     if uploaded_file is not None:
         try:
+            # Parse PDF and identify bank
+            with st.spinner("🔍 Analyzing bank statement..."):
+                text = extract_text_from_pdf(uploaded_file)
+                bank_name = identify_bank_from_text(text)
+                if not bank_name:
+                    st.error("Could not identify bank from statement. Please ensure it's from a supported bank.")
+                    return
+                st.session_state.detected_bank = bank_name
+                st.success(f"Detected Bank: {bank_name}")
+                # Extract metadata based on bank
+                metadata = extract_bank_statement_metadata(text, bank_name)
+                # Parse transactions
+                df = process_text_to_df_enhanced(text, bank_name)
             if df.empty:
                 st.warning("⚠️ No transactions found in the uploaded statement. Please check the file format.")
             st.session_state.transactions_df = df
             # Enhance data with categories
+            df['Category'] = df['Description'].apply(lambda x: categorize_expense_enhanced(x, bank_name))
             # Initialize analytics engines
             personalization = PersonalizationEngine(st.session_state.user_profile)
                 patterns = personalization.analyze_spending_patterns(df)
                 recommendations = personalization.generate_personalized_recommendations(df, patterns)
                 health_score, score_components = personalization.calculate_financial_health_score(df, patterns)
+                loan_prediction = analytics.enhanced_loan_prediction(df, bank_name)
                 df_with_anomalies = analytics.detect_anomalies(df)
             st.session_state.analysis_complete = True
+            # Display account info
+            if bank_name == 'FNB':
+                account_holder_name, closing_balance = metadata
+                st.success(f"Account Holder: {account_holder_name}")
+                st.info(f"Closing Balance: R{closing_balance:,.2f}")
+            else:
+                account_holder_name, account_number, statement_period, closing_balance = metadata
+                st.success(f"Account Holder: {account_holder_name}")
+                st.info(f"Account Number: {account_number} | Statement Period: {statement_period}")
+                st.info(f"Closing Balance: R{closing_balance:,.2f}")
             # Display results in tabs
             tab1, tab2, tab3, tab4, tab5, tab6 = st.tabs([
                 "📊 Overview", "💡 Recommendations", "🏥 Health Score",
         except Exception as e:
             st.error(f"❌ An error occurred while processing your statement: {str(e)}")
+            st.info("Please ensure your PDF is a valid bank statement and try again.")
 if __name__ == "__main__":
     main()
         color: white;
     }
 </style>
+""", unsafe_allow_html=True)