SHIKARICHACHA committed on
Commit
0474efa
·
verified ·
1 Parent(s): c68a3b2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +335 -112
app.py CHANGED
@@ -4,12 +4,18 @@ import re
4
  import pandas as pd
5
  import plotly.express as px
6
  import plotly.graph_objects as go
7
- from plotly.subplots import make_subplots
8
  import numpy as np
9
  from datetime import datetime, timedelta
10
  import sqlite3
11
  import hashlib
12
- import json
 
 
 
 
 
 
 
13
  from http.server import HTTPServer, SimpleHTTPRequestHandler
14
 
15
  # Advanced ML and NLP imports
@@ -26,7 +32,7 @@ except ImportError as e:
26
 
27
  # Configuration
28
  st.set_page_config(
29
- page_title='Enhanced Bank Statement Analysis',
30
  page_icon='๐Ÿฆ',
31
  layout='wide',
32
  initial_sidebar_state='expanded'
@@ -39,10 +45,11 @@ if 'transactions_df' not in st.session_state:
39
  st.session_state.transactions_df = None
40
  if 'analysis_complete' not in st.session_state:
41
  st.session_state.analysis_complete = False
 
 
42
 
43
  class DatabaseManager:
44
  """Handles all database operations for user profiles and financial data"""
45
-
46
  def __init__(self):
47
  self.db_path = 'financial_analysis.db'
48
  self.init_database()
@@ -89,6 +96,7 @@ class DatabaseManager:
89
  CREATE TABLE IF NOT EXISTS financial_data (
90
  id INTEGER PRIMARY KEY AUTOINCREMENT,
91
  user_id TEXT NOT NULL,
 
92
  date TEXT NOT NULL,
93
  description TEXT,
94
  amount REAL,
@@ -104,6 +112,7 @@ class DatabaseManager:
104
  CREATE TABLE IF NOT EXISTS recommendations (
105
  id INTEGER PRIMARY KEY AUTOINCREMENT,
106
  user_id TEXT NOT NULL,
 
107
  recommendation_type TEXT,
108
  title TEXT,
109
  description TEXT,
@@ -158,13 +167,13 @@ class DatabaseManager:
158
  conn.close()
159
 
160
  if result:
161
- columns = ['id', 'user_id', 'name', 'email', 'password_hash', 'financial_goals', 'risk_tolerance', 'monthly_income', 'created_at']
 
162
  return dict(zip(columns, result))
163
  return None
164
 
165
  class PersonalizationEngine:
166
  """Advanced personalization and recommendation system"""
167
-
168
  def __init__(self, user_profile=None):
169
  self.user_profile = user_profile
170
  self.scaler = StandardScaler()
@@ -293,7 +302,6 @@ class PersonalizationEngine:
293
 
294
  class AdvancedAnalytics:
295
  """Advanced analytics and ML models for financial analysis"""
296
-
297
  def __init__(self):
298
  self.models = {}
299
  self.scaler = StandardScaler()
@@ -347,85 +355,153 @@ class AdvancedAnalytics:
347
  'Predicted_Amount': future_predictions
348
  })
349
 
350
- def enhanced_loan_prediction(self, df):
351
- """Enhanced loan eligibility prediction using XGBoost"""
352
  try:
353
  # Load training data
354
- training_data = pd.read_csv('absa.csv')
355
 
356
- # Prepare features
357
  total_credits = df[df['Amount'] > 0]['Amount'].sum()
358
  total_debits = abs(df[df['Amount'] < 0]['Amount'].sum())
359
  num_transactions = len(df)
360
-
361
- # Advanced features
362
- avg_transaction_amount = df['Amount'].mean()
363
- transaction_variability = df['Amount'].std()
364
- balance_trend = df['Balance'].iloc[-1] - df['Balance'].iloc[0] if len(df) > 1 else 0
365
-
366
- # Additional features
367
- credit_frequency = len(df[df['Amount'] > 0]) / max(1, len(df))
368
- max_single_debit = abs(df[df['Amount'] < 0]['Amount'].min()) if len(df[df['Amount'] < 0]) > 0 else 0
369
- balance_volatility = df['Balance'].std()
370
 
371
  # Prepare feature vector
372
  features = pd.DataFrame({
373
- 'total_credits': [total_credits],
374
- 'total_debits': [total_debits],
375
- 'num_transactions': [num_transactions],
376
- 'avg_transaction_amount': [avg_transaction_amount],
377
- 'transaction_variability': [transaction_variability],
378
- 'balance_trend': [balance_trend],
379
- 'credit_frequency': [credit_frequency],
380
- 'max_single_debit': [max_single_debit],
381
- 'balance_volatility': [balance_volatility]
382
  })
383
 
384
- # Train XGBoost model if available
385
- if 'xgb' in globals():
386
- X_train = training_data[['total_credits', 'total_debits', 'num_transactions',
387
- 'avg_transaction_amount', 'transaction_variability', 'balance_trend']]
388
- y_train = training_data['Eligibility (y)']
389
-
390
- # Add missing features with defaults
391
- for col in features.columns:
392
- if col not in X_train.columns:
393
- X_train[col] = 0
394
 
395
- model = xgb.XGBClassifier(random_state=42)
396
- model.fit(X_train[features.columns], y_train)
 
 
 
397
 
398
- prediction = model.predict(features)[0]
399
- prediction_proba = model.predict_proba(features)[0]
400
 
401
- return {
402
- 'eligible': bool(prediction),
403
- 'confidence': float(max(prediction_proba)),
404
- 'model_type': 'XGBoost'
405
- }
406
- else:
407
- # Fallback to Random Forest
408
- model = RandomForestClassifier(n_estimators=100, random_state=42)
409
- X_train = training_data[['total_credits', 'total_debits', 'num_transactions',
410
- 'avg_transaction_amount', 'transaction_variability', 'balance_trend']]
411
- y_train = training_data['Eligibility (y)']
412
- model.fit(X_train, y_train)
413
-
414
- features_basic = features[['total_credits', 'total_debits', 'num_transactions',
415
- 'avg_transaction_amount', 'transaction_variability', 'balance_trend']]
416
- prediction = model.predict(features_basic)[0]
417
- prediction_proba = model.predict_proba(features_basic)[0]
418
-
419
- return {
420
- 'eligible': bool(prediction),
421
- 'confidence': float(max(prediction_proba)),
422
- 'model_type': 'Random Forest'
423
- }
424
 
425
  except Exception as e:
426
  st.error(f"Error in loan prediction: {str(e)}")
427
  return {'eligible': False, 'confidence': 0.0, 'model_type': 'Error'}
428
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
429
  def parse_pdf_enhanced(file):
430
  """Enhanced PDF parsing with better text extraction"""
431
  try:
@@ -434,36 +510,132 @@ def parse_pdf_enhanced(file):
434
  for page in pdf.pages:
435
  page_text = page.extract_text()
436
  if page_text:
437
- text += page_text
438
  return text
439
  except Exception as e:
440
  st.error(f"Error parsing PDF: {str(e)}")
441
  return ""
442
 
443
- def process_text_to_df_enhanced(text):
444
- """Enhanced text processing with better pattern recognition"""
445
  transactions = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
446
 
447
- # Use the exact working pattern from app_original.py as primary
448
- transaction_pattern = re.compile(r'(\d{4}-\d{2}-\d{2})\s+(.+?)\s+(-?R?\d{1,3}(?:,\d{3})*(?:\.\d{2})?)\s+(-?R?\d{1,3}(?:,\d{3})*(?:\.\d{2})?)')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
449
 
450
- for line in text.split('\n'):
451
- line = line.strip()
452
- if not line:
453
- continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
454
 
455
- match = transaction_pattern.search(line)
456
- if match:
457
- try:
458
- date_str, description, amount_str, balance_str = match.groups()
459
 
460
- # Clean and convert amounts (same logic as app_original.py)
461
- amount = float(amount_str.replace(',', '').replace('R', '').replace(' ', ''))
462
- balance = float(balance_str.replace(',', '').replace('R', '').replace(' ', ''))
463
 
464
- transactions.append([date_str, description.strip(), amount, balance])
465
- except (ValueError, AttributeError):
466
- continue
467
 
468
  if not transactions:
469
  return pd.DataFrame(columns=['Date', 'Description', 'Amount', 'Balance'])
@@ -471,15 +643,16 @@ def process_text_to_df_enhanced(text):
471
  df = pd.DataFrame(transactions, columns=['Date', 'Description', 'Amount', 'Balance'])
472
  df['Date'] = pd.to_datetime(df['Date'])
473
  df = df.sort_values('Date').reset_index(drop=True)
 
474
 
475
  return df
476
 
477
- def categorize_expense_enhanced(description):
478
- """Enhanced expense categorization using NLP"""
479
  description_lower = description.lower()
480
 
481
- # Enhanced categorization with more sophisticated rules
482
- category_keywords = {
483
  'Salary/Income': ['salary', 'wage', 'income', 'payroll', 'refund'],
484
  'Groceries': ['grocery', 'supermarket', 'food', 'spar', 'checkers', 'woolworths'],
485
  'Transport': ['fuel', 'petrol', 'uber', 'taxi', 'transport', 'car payment'],
@@ -489,16 +662,36 @@ def categorize_expense_enhanced(description):
489
  'Shopping': ['retail', 'clothing', 'amazon', 'takealot', 'mall'],
490
  'Investments': ['investment', 'shares', 'unit trust', 'retirement'],
491
  'Insurance': ['insurance', 'medical aid', 'life cover', 'short term'],
492
- 'POS Purchases': ['cashsend mobile', 'pos purchase'],
493
- 'Payments': ['immediate trf', 'digital payment', 'payment'],
494
- 'Credits': ['acb credit', 'immediate trf cr', 'credit'],
495
- 'Bank Charges': ['fees', 'charge', 'commission'],
496
- 'Cash Transactions': ['atm', 'cash deposit', 'withdrawal'],
497
  'Cellular': ['airtime', 'data', 'vodacom', 'mtn', 'cell c'],
498
  'Interest': ['interest'],
499
  'Failed Transactions': ['unsuccessful', 'declined', 'failed']
500
  }
501
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
502
  # Check for specific keywords
503
  for category, keywords in category_keywords.items():
504
  if any(keyword in description_lower for keyword in keywords):
@@ -513,11 +706,10 @@ def categorize_expense_enhanced(description):
513
  except:
514
  pass
515
 
516
- return 'Others'
517
 
518
  def create_advanced_visualizations(df, patterns, recommendations):
519
  """Create advanced interactive visualizations"""
520
-
521
  # 1. Financial Health Dashboard
522
  col1, col2, col3, col4 = st.columns(4)
523
 
@@ -589,13 +781,12 @@ def create_advanced_visualizations(df, patterns, recommendations):
589
 
590
  def main():
591
  """Main application function"""
592
-
593
  # Initialize database
594
  db = DatabaseManager()
595
 
596
  # Sidebar for user authentication
597
  with st.sidebar:
598
- st.title("๐Ÿฆ Financial Analysis")
599
 
600
  if st.session_state.user_profile is None:
601
  tab1, tab2 = st.tabs(["Login", "Sign Up"])
@@ -641,14 +832,22 @@ def main():
641
  st.session_state.user_profile = None
642
  st.session_state.transactions_df = None
643
  st.session_state.analysis_complete = False
 
644
  st.experimental_rerun()
645
 
646
  # Main content area
647
  if st.session_state.user_profile is None:
648
  st.markdown("""
649
- # ๐Ÿฆ Enhanced Bank Statement Analysis
 
 
650
 
651
- ### Welcome to the next generation of financial analysis!
 
 
 
 
 
652
 
653
  **Key Features:**
654
  - ๐Ÿค– **AI-Powered Insights**: Advanced machine learning for personalized recommendations
@@ -657,29 +856,42 @@ def main():
657
  - ๐Ÿ”ฎ **Predictive Analysis**: Forecast future spending trends
658
  - ๐Ÿ›ก๏ธ **Anomaly Detection**: Identify unusual transactions
659
  - ๐Ÿ’ก **Smart Recommendations**: Personalized financial advice
 
660
 
661
  **Please login or create an account to get started.**
662
  """)
663
-
664
  return
665
 
666
  # Main analysis interface
667
- st.title(f"๐Ÿฆ Financial Analysis Dashboard - {st.session_state.user_profile['name']}")
668
 
669
  # File upload section
670
  st.markdown("### ๐Ÿ“„ Upload Your Bank Statement")
671
  uploaded_file = st.file_uploader(
672
  "Choose a PDF bank statement",
673
  type="pdf",
674
- help="Upload your ABSA bank statement in PDF format for analysis"
675
  )
676
 
677
  if uploaded_file is not None:
678
  try:
679
- # Parse PDF
680
- with st.spinner("๐Ÿ” Parsing bank statement..."):
681
- text = parse_pdf_enhanced(uploaded_file)
682
- df = process_text_to_df_enhanced(text)
 
 
 
 
 
 
 
 
 
 
 
 
 
683
 
684
  if df.empty:
685
  st.warning("โš ๏ธ No transactions found in the uploaded statement. Please check the file format.")
@@ -689,7 +901,7 @@ def main():
689
  st.session_state.transactions_df = df
690
 
691
  # Enhance data with categories
692
- df['Category'] = df['Description'].apply(categorize_expense_enhanced)
693
 
694
  # Initialize analytics engines
695
  personalization = PersonalizationEngine(st.session_state.user_profile)
@@ -700,11 +912,22 @@ def main():
700
  patterns = personalization.analyze_spending_patterns(df)
701
  recommendations = personalization.generate_personalized_recommendations(df, patterns)
702
  health_score, score_components = personalization.calculate_financial_health_score(df, patterns)
703
- loan_prediction = analytics.enhanced_loan_prediction(df)
704
  df_with_anomalies = analytics.detect_anomalies(df)
705
 
706
  st.session_state.analysis_complete = True
707
 
 
 
 
 
 
 
 
 
 
 
 
708
  # Display results in tabs
709
  tab1, tab2, tab3, tab4, tab5, tab6 = st.tabs([
710
  "๐Ÿ“Š Overview", "๐Ÿ’ก Recommendations", "๐Ÿฅ Health Score",
@@ -902,7 +1125,7 @@ def main():
902
 
903
  except Exception as e:
904
  st.error(f"โŒ An error occurred while processing your statement: {str(e)}")
905
- st.info("Please ensure your PDF is a valid ABSA bank statement and try again.")
906
 
907
  if __name__ == "__main__":
908
  main()
@@ -944,4 +1167,4 @@ st.markdown("""
944
  color: white;
945
  }
946
  </style>
947
- """, unsafe_allow_html=True)
 
4
  import pandas as pd
5
  import plotly.express as px
6
  import plotly.graph_objects as go
 
7
  import numpy as np
8
  from datetime import datetime, timedelta
9
  import sqlite3
10
  import hashlib
11
+ import tempfile
12
+ import PyPDF2
13
+ import os
14
+ import webbrowser
15
+ import threading
16
+ import uuid
17
+ import subprocess
18
+ import time
19
  from http.server import HTTPServer, SimpleHTTPRequestHandler
20
 
21
  # Advanced ML and NLP imports
 
32
 
33
  # Configuration
34
  st.set_page_config(
35
+ page_title='Universal Bank Statement Analyzer',
36
  page_icon='๐Ÿฆ',
37
  layout='wide',
38
  initial_sidebar_state='expanded'
 
45
  st.session_state.transactions_df = None
46
  if 'analysis_complete' not in st.session_state:
47
  st.session_state.analysis_complete = False
48
+ if 'detected_bank' not in st.session_state:
49
+ st.session_state.detected_bank = None
50
 
51
  class DatabaseManager:
52
  """Handles all database operations for user profiles and financial data"""
 
53
  def __init__(self):
54
  self.db_path = 'financial_analysis.db'
55
  self.init_database()
 
96
  CREATE TABLE IF NOT EXISTS financial_data (
97
  id INTEGER PRIMARY KEY AUTOINCREMENT,
98
  user_id TEXT NOT NULL,
99
+ bank_name TEXT,
100
  date TEXT NOT NULL,
101
  description TEXT,
102
  amount REAL,
 
112
  CREATE TABLE IF NOT EXISTS recommendations (
113
  id INTEGER PRIMARY KEY AUTOINCREMENT,
114
  user_id TEXT NOT NULL,
115
+ bank_name TEXT,
116
  recommendation_type TEXT,
117
  title TEXT,
118
  description TEXT,
 
167
  conn.close()
168
 
169
  if result:
170
+ columns = ['id', 'user_id', 'name', 'email', 'password_hash', 'financial_goals',
171
+ 'risk_tolerance', 'monthly_income', 'created_at']
172
  return dict(zip(columns, result))
173
  return None
174
 
175
  class PersonalizationEngine:
176
  """Advanced personalization and recommendation system"""
 
177
  def __init__(self, user_profile=None):
178
  self.user_profile = user_profile
179
  self.scaler = StandardScaler()
 
302
 
303
  class AdvancedAnalytics:
304
  """Advanced analytics and ML models for financial analysis"""
 
305
  def __init__(self):
306
  self.models = {}
307
  self.scaler = StandardScaler()
 
355
  'Predicted_Amount': future_predictions
356
  })
357
 
358
def enhanced_loan_prediction(self, df, bank_name):
    """Predict loan eligibility from the transactions of one statement.

    A RandomForestClassifier is trained on the bank-specific CSV file named
    after ``bank_name`` (lower-cased, spaces replaced by underscores, with a
    ``.csv`` suffix) and then applied to summary features derived from ``df``.

    Args:
        df: Transactions with at least ``Amount`` and ``Balance`` columns.
            Positive amounts are counted as credits, negative ones as debits.
        bank_name: Name of the detected bank; selects the training CSV.

    Returns:
        dict with keys ``eligible`` (bool), ``confidence`` (float, the highest
        class probability) and ``model_type`` (str). On any failure the error
        is reported through ``st.error`` and
        ``{'eligible': False, 'confidence': 0.0, 'model_type': 'Error'}``
        is returned.
    """
    try:
        # Fail early with a readable message instead of letting NaN features
        # reach the classifier.
        if df is None or len(df) == 0:
            raise ValueError("no transactions available for loan prediction")

        # Load training data
        training_file = f'{bank_name.lower().replace(" ", "_")}.csv'
        training_data = pd.read_csv(training_file)
        if 'Eligibility' not in training_data.columns:
            raise ValueError(
                f"training data '{training_file}' has no 'Eligibility' column"
            )

        # Prepare features from the transaction data
        amounts = df['Amount']
        balances = df['Balance']
        features = pd.DataFrame({
            'Total_Credits': [amounts[amounts > 0].sum()],
            'Total_Debits': [abs(amounts[amounts < 0].sum())],
            'Average_Balance': [balances.mean()],
            'Num_Transactions': [len(df)],
            'Closing_Balance': [balances.iloc[-1]],
        })

        # Ensure we only use columns that exist in training data
        available_columns = [
            col for col in features.columns if col in training_data.columns
        ]
        if not available_columns:
            raise ValueError(
                f"training data '{training_file}' shares no feature columns "
                "with the statement features"
            )
        features = features[available_columns]

        # Train model (re-fitted on every call; the fitted model is not cached)
        model = RandomForestClassifier(n_estimators=100, random_state=42)
        model.fit(training_data[available_columns], training_data['Eligibility'])

        prediction = model.predict(features)[0]
        prediction_proba = model.predict_proba(features)[0]

        return {
            'eligible': bool(prediction),
            'confidence': float(max(prediction_proba)),
            'model_type': 'Random Forest'
        }

    except Exception as e:
        # Top-level boundary for the UI: report the problem and fall back to a
        # "not eligible" result instead of aborting the whole analysis.
        st.error(f"Error in loan prediction: {str(e)}")
        return {'eligible': False, 'confidence': 0.0, 'model_type': 'Error'}
402
 
403
def extract_text_from_pdf(file):
    """Extract the text of every page of a PDF file.

    Args:
        file: Either a filesystem path (``str``) or a binary file-like object
            that ``PyPDF2.PdfReader`` can read.

    Returns:
        The text of all pages, each page terminated by a newline, or ``""``
        when extraction fails (the error is reported through ``st.error``).
    """
    def _read_all_pages(stream):
        # One newline per page keeps the last line of a page from fusing with
        # the first line of the next one; the line-based transaction parsers
        # rely on that. Pages without extractable text are skipped, mirroring
        # the ``if page_text`` guard used by parse_pdf_enhanced.
        reader = PyPDF2.PdfReader(stream)
        page_texts = (page.extract_text() for page in reader.pages)
        return ''.join(page_text + '\n' for page_text in page_texts if page_text)

    try:
        if isinstance(file, str):
            with open(file, 'rb') as handle:
                return _read_all_pages(handle)
        return _read_all_pages(file)
    except Exception as e:
        st.error(f"Error extracting text from PDF: {str(e)}")
        return ""
421
+
422
def identify_bank_from_text(text):
    """Identify the issuing bank from the text of a statement.

    The text is lower-cased and searched for known bank keywords (plain
    substring match). Banks are tried in the order listed below and the first
    one with a matching keyword wins, so a statement that merely mentions
    another bank earlier in that order is attributed to that bank.

    Args:
        text: Statement text; ``None`` or an empty string is allowed.

    Returns:
        The bank name (``'FNB'``, ``'Standard Bank'``, ``'Nedbank'``,
        ``'ABSA'`` or ``'Capitec Bank'``), or ``None`` when no keyword is
        found.
    """
    # Nothing to search in (e.g. PDF extraction failed and yielded no text).
    if not text:
        return None

    text_lower = text.lower()
    bank_keywords = {
        'FNB': ['fnb', 'first national bank'],
        'Standard Bank': ['standard bank'],
        'Nedbank': ['nedbank'],
        'ABSA': ['absa'],
        'Capitec Bank': ['capitec']
    }

    for bank, keywords in bank_keywords.items():
        if any(keyword in text_lower for keyword in keywords):
            return bank
    return None
437
+
438
def extract_bank_statement_metadata(text, bank_name):
    """Extract header metadata from the text of a bank statement.

    Args:
        text: Full statement text (``None`` is treated as empty).
        bank_name: Detected bank name; only ``'FNB'`` uses a dedicated layout.

    Returns:
        For ``'FNB'``: ``(account_holder_name, closing_balance)``.
        For every other bank: ``(account_holder_name, account_number,
        statement_period, closing_balance)``. The caller unpacks four values
        for all non-FNB banks, so the fallback for banks without a dedicated
        layout returns the 4-tuple as well.
        Fields that cannot be found fall back to ``"Name not found"``,
        ``"Not found"``, ``"Not specified"`` and ``0.0`` respectively.
    """
    text = text or ""

    def _to_amount(match):
        # Convert the first capture group to a float; 0.0 when absent/invalid.
        if not match:
            return 0.0
        cleaned = match.group(1).replace(',', '').replace('R', '').replace(' ', '')
        try:
            return float(cleaned)
        except ValueError:
            return 0.0

    def _field(pattern, default):
        # Return the stripped first capture group of ``pattern`` or ``default``.
        found = re.search(pattern, text)
        return found.group(1).strip() if found else default

    if bank_name == 'FNB':
        # The name must stay on one line: matching arbitrary whitespace would
        # drag the following upper-case lines (e.g. the address) into it.
        name_match = re.search(r"\b(?:MR|MRS)[ \t]+[A-Z][A-Z \t]*", text)
        account_holder_name = name_match.group(0).strip() if name_match else "Name not found"

        closing_balance = _to_amount(
            re.search(r"Closing Balance\s+([\d,]+\.?\d*)", text)
        )
        return account_holder_name, closing_balance

    # Labelled "Key: value" layout shared by all remaining banks.
    # ``(?:\n|$)`` also accepts a field on the very last line of the text.
    account_holder_name = _field(r"Account Holder:\s*(.+?)(?:\n|$)", "Name not found")
    account_number = _field(r"Account Number:\s*(\d+)", "Not found")
    statement_period = _field(r"Statement Period:\s*(.+?)(?:\n|$)", "Not specified")

    # Accept comma-grouped amounts (12,345.67) as well as ungrouped ones
    # (12345.67); a bare ``\d{1,3}`` would cut the latter down to "123".
    closing_balance = _to_amount(
        re.search(
            r"Closing Balance:\s*(-?R?(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d{2})?)",
            text,
        )
    )

    return account_holder_name, account_number, statement_period, closing_balance
504
+
505
  def parse_pdf_enhanced(file):
506
  """Enhanced PDF parsing with better text extraction"""
507
  try:
 
510
  for page in pdf.pages:
511
  page_text = page.extract_text()
512
  if page_text:
513
+ text += page_text + '\n'
514
  return text
515
  except Exception as e:
516
  st.error(f"Error parsing PDF: {str(e)}")
517
  return ""
518
 
519
def process_text_to_df_enhanced(text, bank_name):
    """Parse statement text into a transactions DataFrame.

    Each line is matched against a bank-specific layout of the form
    ``<date> <description> <amount> <balance>``:

    * ``'FNB'``: date ``02 Apr`` (no year), amount/balance may carry a
      ``Cr``/``Dr`` suffix.
    * ``'Standard Bank'``: date ``02 Apr 22``.
    * ``'Nedbank'``: date ``02/04/2022`` (day first).
    * any other bank: date ``2022-04-02``.

    Sign convention: credits are positive and debits negative, which is what
    the analytics code expects (it sums ``Amount > 0`` as credits).

    Args:
        text: Statement text, one transaction per line (``None`` is treated
            as empty).
        bank_name: Detected bank name selecting the layout.

    Returns:
        DataFrame sorted by ``Date`` with columns ``Date`` (datetime),
        ``Description``, ``Amount``, ``Balance`` and ``Bank``. When no line
        matches, an empty DataFrame with the first four columns is returned.
    """
    columns = ['Date', 'Description', 'Amount', 'Balance']

    # A number is comma-grouped (1,234.56), ungrouped with decimals (1234.56)
    # or at most three bare digits. Ungrouped digits must have decimals so
    # that long reference numbers in the description are not read as amounts.
    number = r'-?R?(?:\d{1,3}(?:,\d{3})+|\d+(?=\.\d{2})|\d{1,3})(?:\.\d{2})?'

    # bank -> (date regex, strptime format or None, amount/balance separator)
    layouts = {
        'FNB': (r'\d{2} \w{3}', None, r'\s*'),
        'Standard Bank': (r'\d{2} \w{3} \d{2}', '%d %b %y', r'\s+'),
        'Nedbank': (r'\d{2}/\d{2}/\d{4}', '%d/%m/%Y', r'\s+'),
    }
    date_regex, date_format, separator = layouts.get(
        bank_name, (r'\d{4}-\d{2}-\d{2}', '%Y-%m-%d', r'\s+')
    )
    is_fnb = bank_name == 'FNB'
    # Only FNB prints Cr/Dr suffixes; other layouts get an empty group so the
    # match always yields six groups.
    suffix = r'(Cr|Dr)?' if is_fnb else r'()'

    # ``(?!\d)`` stops the balance from being cut off in the middle of a
    # longer digit run.
    transaction_pattern = re.compile(
        rf'({date_regex})\s+(.+?)\s+({number}){suffix}{separator}'
        rf'({number})(?!\d){suffix}'
    )

    def _to_float(raw):
        # Strip thousands separators and the currency marker.
        return float(raw.replace(',', '').replace('R', '').replace(' ', ''))

    def _resolve_fnb_date(day_month):
        # FNB lines carry no year: use the most recent year in which the date
        # is valid and not in the future (a statement cannot contain future
        # transactions; 29 Feb only exists in leap years).
        today = datetime.now()
        for year in range(today.year, today.year - 8, -1):
            try:
                candidate = datetime.strptime(f"{day_month} {year}", "%d %b %Y")
            except ValueError:
                continue
            if candidate <= today:
                return candidate
        raise ValueError(f"cannot resolve date {day_month!r}")

    transactions = []
    for line in (text or '').split('\n'):
        line = line.strip()
        if not line or 'Date Description' in line or 'Transactions in RAND' in line:
            continue

        match = transaction_pattern.search(line)
        if not match:
            continue

        (date_str, description, amount_str, amount_tag,
         balance_str, balance_tag) = match.groups()

        try:
            if is_fnb:
                date_obj = _resolve_fnb_date(date_str)
            else:
                # Explicit format: day-first dates such as 02/04/2022 would
                # otherwise be read month-first by pd.to_datetime.
                date_obj = datetime.strptime(date_str, date_format)
            amount = _to_float(amount_str)
            balance = _to_float(balance_str)
        except ValueError:
            # Line looked like a transaction but holds an invalid date/number.
            continue

        if is_fnb:
            # "Cr" marks money coming in; anything else is a debit.
            amount = abs(amount) if amount_tag == 'Cr' else -abs(amount)
            if balance_tag == 'Dr':
                balance = -abs(balance)  # overdrawn balance

        transactions.append(
            [date_obj.strftime('%Y-%m-%d'), description.strip(), amount, balance]
        )

    if not transactions:
        return pd.DataFrame(columns=columns)

    df = pd.DataFrame(transactions, columns=columns)
    df['Date'] = pd.to_datetime(df['Date'])
    # Stable sort keeps same-day transactions in statement order, so the last
    # row still holds the closing balance.
    df = df.sort_values('Date', kind='mergesort').reset_index(drop=True)
    df['Bank'] = bank_name  # Add bank name column

    return df
649
 
650
+ def categorize_expense_enhanced(description, bank_name):
651
+ """Enhanced expense categorization with bank-specific rules"""
652
  description_lower = description.lower()
653
 
654
+ # Common categories across all banks
655
+ common_categories = {
656
  'Salary/Income': ['salary', 'wage', 'income', 'payroll', 'refund'],
657
  'Groceries': ['grocery', 'supermarket', 'food', 'spar', 'checkers', 'woolworths'],
658
  'Transport': ['fuel', 'petrol', 'uber', 'taxi', 'transport', 'car payment'],
 
662
  'Shopping': ['retail', 'clothing', 'amazon', 'takealot', 'mall'],
663
  'Investments': ['investment', 'shares', 'unit trust', 'retirement'],
664
  'Insurance': ['insurance', 'medical aid', 'life cover', 'short term'],
665
+ 'Bank Charges': ['fees', 'charge', 'service', 'cost', 'monthly account fee'],
666
+ 'Cash Transactions': ['atm', 'cash', 'withdrawal', 'deposit'],
 
 
 
667
  'Cellular': ['airtime', 'data', 'vodacom', 'mtn', 'cell c'],
668
  'Interest': ['interest'],
669
  'Failed Transactions': ['unsuccessful', 'declined', 'failed']
670
  }
671
 
672
+ # Bank-specific categories
673
+ bank_specific = {
674
+ 'FNB': {
675
+ 'POS Purchases': ['pos purchase', 'card purchase', 'debit order'],
676
+ 'Payments': ['payment to', 'fnb app rtc pmt', 'internet pmt', 'debit order']
677
+ },
678
+ 'Standard Bank': {
679
+ 'Payments': ['payment', 'transfer', 'debit order', 'immediate trf', 'digital payment']
680
+ },
681
+ 'Nedbank': {
682
+ 'POS Purchases': ['pos purchase', 'card purchase', 'debit order', 'cashsend'],
683
+ 'Payments': ['payment to', 'immediate trf', 'digital payment', 'pmt to'],
684
+ 'Loans': ['loan payment', 'nedloan', 'personal loan']
685
+ },
686
+ 'ABSA': {
687
+ 'POS Purchases': ['cashsend mobile', 'pos purchase'],
688
+ 'Payments': ['immediate trf', 'digital payment', 'payment']
689
+ }
690
+ }
691
+
692
+ # Combine common and bank-specific categories
693
+ category_keywords = {**common_categories, **(bank_specific.get(bank_name, {}))}
694
+
695
  # Check for specific keywords
696
  for category, keywords in category_keywords.items():
697
  if any(keyword in description_lower for keyword in keywords):
 
706
  except:
707
  pass
708
 
709
+ return 'Other'
710
 
711
  def create_advanced_visualizations(df, patterns, recommendations):
712
  """Create advanced interactive visualizations"""
 
713
  # 1. Financial Health Dashboard
714
  col1, col2, col3, col4 = st.columns(4)
715
 
 
781
 
782
  def main():
783
  """Main application function"""
 
784
  # Initialize database
785
  db = DatabaseManager()
786
 
787
  # Sidebar for user authentication
788
  with st.sidebar:
789
+ st.title("๐Ÿฆ Universal Bank Analyzer")
790
 
791
  if st.session_state.user_profile is None:
792
  tab1, tab2 = st.tabs(["Login", "Sign Up"])
 
832
  st.session_state.user_profile = None
833
  st.session_state.transactions_df = None
834
  st.session_state.analysis_complete = False
835
+ st.session_state.detected_bank = None
836
  st.experimental_rerun()
837
 
838
  # Main content area
839
  if st.session_state.user_profile is None:
840
  st.markdown("""
841
+ # ๐Ÿฆ Universal Bank Statement Analysis
842
+
843
+ ### Welcome to the next generation of financial analysis for all major banks!
844
 
845
+ **Supported Banks:**
846
+ - FNB (First National Bank)
847
+ - Standard Bank
848
+ - Nedbank
849
+ - ABSA
850
+ - Capitec Bank
851
 
852
  **Key Features:**
853
  - ๐Ÿค– **AI-Powered Insights**: Advanced machine learning for personalized recommendations
 
856
  - ๐Ÿ”ฎ **Predictive Analysis**: Forecast future spending trends
857
  - ๐Ÿ›ก๏ธ **Anomaly Detection**: Identify unusual transactions
858
  - ๐Ÿ’ก **Smart Recommendations**: Personalized financial advice
859
+ - ๐Ÿ’ฐ **Loan Eligibility**: Check your loan eligibility instantly
860
 
861
  **Please login or create an account to get started.**
862
  """)
 
863
  return
864
 
865
  # Main analysis interface
866
+ st.title(f"๐Ÿฆ Universal Bank Analysis Dashboard - {st.session_state.user_profile['name']}")
867
 
868
  # File upload section
869
  st.markdown("### ๐Ÿ“„ Upload Your Bank Statement")
870
  uploaded_file = st.file_uploader(
871
  "Choose a PDF bank statement",
872
  type="pdf",
873
+ help="Upload your bank statement in PDF format for analysis"
874
  )
875
 
876
  if uploaded_file is not None:
877
  try:
878
+ # Parse PDF and identify bank
879
+ with st.spinner("๐Ÿ” Analyzing bank statement..."):
880
+ text = extract_text_from_pdf(uploaded_file)
881
+ bank_name = identify_bank_from_text(text)
882
+
883
+ if not bank_name:
884
+ st.error("Could not identify bank from statement. Please ensure it's from a supported bank.")
885
+ return
886
+
887
+ st.session_state.detected_bank = bank_name
888
+ st.success(f"Detected Bank: {bank_name}")
889
+
890
+ # Extract metadata based on bank
891
+ metadata = extract_bank_statement_metadata(text, bank_name)
892
+
893
+ # Parse transactions
894
+ df = process_text_to_df_enhanced(text, bank_name)
895
 
896
  if df.empty:
897
  st.warning("โš ๏ธ No transactions found in the uploaded statement. Please check the file format.")
 
901
  st.session_state.transactions_df = df
902
 
903
  # Enhance data with categories
904
+ df['Category'] = df['Description'].apply(lambda x: categorize_expense_enhanced(x, bank_name))
905
 
906
  # Initialize analytics engines
907
  personalization = PersonalizationEngine(st.session_state.user_profile)
 
912
  patterns = personalization.analyze_spending_patterns(df)
913
  recommendations = personalization.generate_personalized_recommendations(df, patterns)
914
  health_score, score_components = personalization.calculate_financial_health_score(df, patterns)
915
+ loan_prediction = analytics.enhanced_loan_prediction(df, bank_name)
916
  df_with_anomalies = analytics.detect_anomalies(df)
917
 
918
  st.session_state.analysis_complete = True
919
 
920
+ # Display account info
921
+ if bank_name == 'FNB':
922
+ account_holder_name, closing_balance = metadata
923
+ st.success(f"Account Holder: {account_holder_name}")
924
+ st.info(f"Closing Balance: R{closing_balance:,.2f}")
925
+ else:
926
+ account_holder_name, account_number, statement_period, closing_balance = metadata
927
+ st.success(f"Account Holder: {account_holder_name}")
928
+ st.info(f"Account Number: {account_number} | Statement Period: {statement_period}")
929
+ st.info(f"Closing Balance: R{closing_balance:,.2f}")
930
+
931
  # Display results in tabs
932
  tab1, tab2, tab3, tab4, tab5, tab6 = st.tabs([
933
  "๐Ÿ“Š Overview", "๐Ÿ’ก Recommendations", "๐Ÿฅ Health Score",
 
1125
 
1126
  except Exception as e:
1127
  st.error(f"โŒ An error occurred while processing your statement: {str(e)}")
1128
+ st.info("Please ensure your PDF is a valid bank statement and try again.")
1129
 
1130
  if __name__ == "__main__":
1131
  main()
 
1167
  color: white;
1168
  }
1169
  </style>
1170
+ """, unsafe_allow_html=True)