Spaces:
Running
Running
Olmadı; içerikleri çıkarmada zayıf. Daha iyi araçları ve kütüphaneleri kullan.
Browse files- index.html +19 -4
- script.js +476 -121
index.html
CHANGED
|
@@ -8,11 +8,26 @@
|
|
| 8 |
<script src="https://cdn.tailwindcss.com"></script>
|
| 9 |
<script src="https://cdn.jsdelivr.net/npm/feather-icons/dist/feather.min.js"></script>
|
| 10 |
<script src="https://unpkg.com/feather-icons"></script>
|
| 11 |
-
|
| 12 |
-
<script src="https://cdnjs.cloudflare.com/ajax/libs/
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
<script src="https://cdnjs.cloudflare.com/ajax/libs/xlsx/0.18.5/xlsx.full.min.js"></script>
|
| 14 |
-
|
| 15 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
</head>
|
| 17 |
<body class="bg-gray-50 min-h-screen">
|
| 18 |
<custom-navbar></custom-navbar>
|
|
|
|
| 8 |
<script src="https://cdn.tailwindcss.com"></script>
|
| 9 |
<script src="https://cdn.jsdelivr.net/npm/feather-icons/dist/feather.min.js"></script>
|
| 10 |
<script src="https://unpkg.com/feather-icons"></script>
|
| 11 |
+
<!-- Enhanced PDF Processing -->
|
| 12 |
+
<script src="https://cdnjs.cloudflare.com/ajax/libs/pdf.js/3.11.174/pdf.min.js"></script>
|
| 13 |
+
<script src="https://cdnjs.cloudflare.com/ajax/libs/pdf-lib/1.17.1/pdf-lib.min.js"></script>
|
| 14 |
+
|
| 15 |
+
<!-- Enhanced Document Processing -->
|
| 16 |
+
<script src="https://cdnjs.cloudflare.com/ajax/libs/mammoth/1.6.0/mammoth.browser.min.js"></script>
|
| 17 |
<script src="https://cdnjs.cloudflare.com/ajax/libs/xlsx/0.18.5/xlsx.full.min.js"></script>
|
| 18 |
+
|
| 19 |
+
<!-- Enhanced OCR with Multiple Languages -->
|
| 20 |
+
<script src="https://cdn.jsdelivr.net/npm/tesseract.js@4/dist/tesseract.min.js"></script>
|
| 21 |
+
<script src="https://unpkg.com/[email protected]/dist/ocr-space-api.min.js"></script>
|
| 22 |
+
|
| 23 |
+
<!-- Image Processing -->
|
| 24 |
+
<script src="https://cdnjs.cloudflare.com/ajax/libs/cropperjs/1.6.1/cropper.min.js"></script>
|
| 25 |
+
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/cropperjs/1.6.1/cropper.min.css">
|
| 26 |
+
|
| 27 |
+
<!-- Additional Language Support -->
|
| 28 |
+
<script src="https://cdn.jsdelivr.net/npm/[email protected]/he.js"></script>
|
| 29 |
+
<script src="https://cdnjs.cloudflare.com/ajax/libs/iconv-lite/0.6.3/iconv-lite.min.js"></script>
|
| 30 |
+
<script src="script.js"></script>
|
| 31 |
</head>
|
| 32 |
<body class="bg-gray-50 min-h-screen">
|
| 33 |
<custom-navbar></custom-navbar>
|
script.js
CHANGED
|
@@ -11,11 +11,12 @@ document.addEventListener('DOMContentLoaded', function() {
|
|
| 11 |
|
| 12 |
let files = [];
|
| 13 |
let processedResults = [];
|
|
|
|
|
|
|
| 14 |
|
| 15 |
-
//
|
| 16 |
-
pdfjsLib.GlobalWorkerOptions.
|
| 17 |
-
|
| 18 |
-
// Handle file selection
|
| 19 |
uploadBtn.addEventListener('click', () => fileInput.click());
|
| 20 |
|
| 21 |
fileInput.addEventListener('change', handleFileSelection);
|
|
@@ -182,73 +183,100 @@ return {
|
|
| 182 |
};
|
| 183 |
}
|
| 184 |
async function extractTextFromPDF(file) {
|
| 185 |
-
return new Promise((resolve, reject) => {
|
| 186 |
const reader = new FileReader();
|
| 187 |
|
| 188 |
reader.onload = async function(event) {
|
| 189 |
try {
|
| 190 |
const typedArray = new Uint8Array(event.target.result);
|
|
|
|
|
|
|
| 191 |
const loadingTask = pdfjsLib.getDocument({
|
| 192 |
data: typedArray,
|
| 193 |
-
cMapUrl: 'https://cdn.jsdelivr.net/npm/pdfjs-dist@
|
| 194 |
cMapPacked: true,
|
| 195 |
-
standardFontDataUrl: 'https://cdn.jsdelivr.net/npm/pdfjs-dist@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 196 |
});
|
| 197 |
|
| 198 |
const pdf = await loadingTask.promise;
|
| 199 |
let fullText = '';
|
|
|
|
| 200 |
|
|
|
|
| 201 |
for (let i = 1; i <= pdf.numPages; i++) {
|
| 202 |
const page = await pdf.getPage(i);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 203 |
const textContent = await page.getTextContent({
|
| 204 |
normalizeWhitespace: false,
|
| 205 |
-
disableCombineTextItems: false
|
|
|
|
| 206 |
});
|
| 207 |
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
|
|
|
|
|
|
| 211 |
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
.replace(/\u00C3\u0087/g, 'Ç') // Ç
|
| 221 |
-
.replace(/\u00C3\u011F/g, 'ğ') // ğ
|
| 222 |
-
.replace(/\u00C4\u0178/g, 'Ğ') // Ğ
|
| 223 |
-
.replace(/\u00C3\u00BC/g, 'ü') // ü
|
| 224 |
-
.replace(/\u00C3\u009C/g, 'Ü') // Ü
|
| 225 |
-
.replace(/\u00C3\u015F/g, 'ş') // ş
|
| 226 |
-
.replace(/\u00C5\u0178/g, 'Ş') // Ş
|
| 227 |
-
.replace(/\u00C3\u0131/g, 'ı') // ı
|
| 228 |
-
.replace(/\u00C4\u0131/g, 'İ') // İ
|
| 229 |
-
.replace(/\u00C3\u00B6/g, 'ö') // ö
|
| 230 |
-
.replace(/\u00C3\u0096/g, 'Ö'); // Ö
|
| 231 |
-
if (!fullText.trim()) {
|
| 232 |
-
console.warn('PDF text extraction returned empty content. Attempting OCR processing...');
|
| 233 |
-
try {
|
| 234 |
-
// Convert PDF to image for OCR processing
|
| 235 |
-
const images = await convertPDFToImages(typedArray);
|
| 236 |
-
let ocrText = '';
|
| 237 |
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
}
|
| 242 |
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 247 |
}
|
| 248 |
}
|
| 249 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 250 |
resolve(fullText);
|
| 251 |
-
|
|
|
|
| 252 |
console.error('PDF extraction error:', error);
|
| 253 |
reject(new Error('Failed to extract text from PDF: ' + error.message));
|
| 254 |
}
|
|
@@ -258,122 +286,449 @@ return {
|
|
| 258 |
reader.readAsArrayBuffer(file);
|
| 259 |
});
|
| 260 |
}
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 264 |
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
|
| 273 |
-
|
| 274 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 275 |
});
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 276 |
}
|
| 277 |
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 281 |
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
result[sheetName] = XLSX.utils.sheet_to_json(worksheet, { header: 1 });
|
| 291 |
-
});
|
| 292 |
-
|
| 293 |
-
resolve(result);
|
| 294 |
-
} catch (error) {
|
| 295 |
-
reject(error);
|
| 296 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 297 |
};
|
| 298 |
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 302 |
}
|
| 303 |
-
|
| 304 |
-
|
|
|
|
| 305 |
const loadingTask = pdfjsLib.getDocument({
|
| 306 |
data: pdfData,
|
| 307 |
-
cMapUrl: 'https://cdn.jsdelivr.net/npm/pdfjs-dist@
|
| 308 |
cMapPacked: true,
|
| 309 |
-
standardFontDataUrl: 'https://cdn.jsdelivr.net/npm/pdfjs-dist@
|
| 310 |
});
|
| 311 |
|
| 312 |
const pdf = await loadingTask.promise;
|
| 313 |
const images = [];
|
| 314 |
|
| 315 |
-
|
|
|
|
| 316 |
const page = await pdf.getPage(i);
|
| 317 |
-
const viewport = page.getViewport({ scale:
|
|
|
|
| 318 |
const canvas = document.createElement('canvas');
|
| 319 |
const context = canvas.getContext('2d');
|
| 320 |
|
| 321 |
canvas.height = viewport.height;
|
| 322 |
canvas.width = viewport.width;
|
| 323 |
|
|
|
|
| 324 |
await page.render({
|
| 325 |
canvasContext: context,
|
| 326 |
-
viewport: viewport
|
|
|
|
|
|
|
| 327 |
}).promise;
|
| 328 |
|
| 329 |
images.push(canvas);
|
| 330 |
}
|
| 331 |
|
| 332 |
-
|
| 333 |
-
}
|
| 334 |
-
|
| 335 |
-
|
| 336 |
-
|
| 337 |
-
|
| 338 |
-
|
| 339 |
-
|
| 340 |
-
|
| 341 |
-
|
| 342 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 343 |
}
|
| 344 |
-
}
|
| 345 |
-
|
| 346 |
-
|
| 347 |
-
|
| 348 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 349 |
|
| 350 |
-
|
| 351 |
-
|
| 352 |
-
'
|
| 353 |
-
|
| 354 |
-
|
| 355 |
-
|
| 356 |
-
|
| 357 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 358 |
user_defined_words: Object.keys(trainedWords).join(' '),
|
| 359 |
-
|
| 360 |
-
|
| 361 |
-
|
| 362 |
-
load_freq_dawg: 1,
|
| 363 |
user_words_suffix: 'tur',
|
| 364 |
-
user_patterns_suffix: 'tur'
|
| 365 |
-
|
| 366 |
-
|
| 367 |
-
|
| 368 |
-
|
| 369 |
-
|
| 370 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 371 |
} else {
|
| 372 |
-
resolve(text);
|
| 373 |
}
|
| 374 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 375 |
});
|
| 376 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 377 |
// Apply learned corrections
|
| 378 |
if (window.ocrLearningDict) {
|
| 379 |
for (const [word, data] of Object.entries(window.ocrLearningDict)) {
|
|
|
|
| 11 |
|
| 12 |
let files = [];
|
| 13 |
let processedResults = [];
|
| 14 |
+
// Set enhanced PDF.js worker path with additional configurations
|
| 15 |
+
pdfjsLib.GlobalWorkerOptions.workerSrc = 'https://cdnjs.cloudflare.com/ajax/libs/pdf.js/3.11.174/pdf.worker.min.js';
|
| 16 |
|
| 17 |
+
// Configure PDF.js for better text extraction
|
| 18 |
+
pdfjsLib.GlobalWorkerOptions.isEvalSupported = false;
|
| 19 |
+
// Handle file selection
|
|
|
|
| 20 |
uploadBtn.addEventListener('click', () => fileInput.click());
|
| 21 |
|
| 22 |
fileInput.addEventListener('change', handleFileSelection);
|
|
|
|
| 183 |
};
|
| 184 |
}
|
| 185 |
async function extractTextFromPDF(file) {
|
| 186 |
+
return new Promise(async (resolve, reject) => {
|
| 187 |
const reader = new FileReader();
|
| 188 |
|
| 189 |
reader.onload = async function(event) {
|
| 190 |
try {
|
| 191 |
const typedArray = new Uint8Array(event.target.result);
|
| 192 |
+
|
| 193 |
+
// Enhanced PDF loading with multiple extraction strategies
|
| 194 |
const loadingTask = pdfjsLib.getDocument({
|
| 195 |
data: typedArray,
|
| 196 |
+
cMapUrl: 'https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/cmaps/',
|
| 197 |
cMapPacked: true,
|
| 198 |
+
standardFontDataUrl: 'https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/standard_fonts/',
|
| 199 |
+
useSystemFonts: true,
|
| 200 |
+
useWorkerFetch: true,
|
| 201 |
+
isEvalSupported: false,
|
| 202 |
+
disableAutoFetch: false,
|
| 203 |
+
disableStream: false
|
| 204 |
});
|
| 205 |
|
| 206 |
const pdf = await loadingTask.promise;
|
| 207 |
let fullText = '';
|
| 208 |
+
let metadata = await pdf.getMetadata();
|
| 209 |
|
| 210 |
+
// Strategy 1: Enhanced text extraction with structural analysis
|
| 211 |
for (let i = 1; i <= pdf.numPages; i++) {
|
| 212 |
const page = await pdf.getPage(i);
|
| 213 |
+
|
| 214 |
+
// Get viewport for better text positioning
|
| 215 |
+
const viewport = page.getViewport({ scale: 2.0 });
|
| 216 |
+
|
| 217 |
+
// Enhanced text content extraction
|
| 218 |
const textContent = await page.getTextContent({
|
| 219 |
normalizeWhitespace: false,
|
| 220 |
+
disableCombineTextItems: false,
|
| 221 |
+
includeMarkedContent: true
|
| 222 |
});
|
| 223 |
|
| 224 |
+
// Process text items with better grouping
|
| 225 |
+
const textItems = textContent.items;
|
| 226 |
+
let pageText = '';
|
| 227 |
+
let lastY = null;
|
| 228 |
+
let lastX = null;
|
| 229 |
|
| 230 |
+
for (let j = 0; j < textItems.length; j++) {
|
| 231 |
+
const item = textItems[j];
|
| 232 |
+
const tx = pdfjsLib.Util.transform(
|
| 233 |
+
viewport.transform,
|
| 234 |
+
item.transform
|
| 235 |
+
);
|
| 236 |
+
const x = tx[4];
|
| 237 |
+
const y = tx[5];
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 238 |
|
| 239 |
+
// Add line breaks based on Y position
|
| 240 |
+
if (lastY !== null && Math.abs(y - lastY) > item.height * 0.8) {
|
| 241 |
+
pageText += '\n';
|
| 242 |
}
|
| 243 |
|
| 244 |
+
// Add spaces based on X position
|
| 245 |
+
if (lastX !== null && x - lastX > item.width * 0.3) {
|
| 246 |
+
pageText += ' ';
|
| 247 |
+
}
|
| 248 |
+
|
| 249 |
+
pageText += item.str;
|
| 250 |
+
lastY = y;
|
| 251 |
+
lastX = x + item.width;
|
| 252 |
+
}
|
| 253 |
+
|
| 254 |
+
// Clean up and format the text
|
| 255 |
+
pageText = pageText
|
| 256 |
+
.replace(/\s+/g, ' ')
|
| 257 |
+
.replace(/\n\s*\n/g, '\n\n')
|
| 258 |
+
.trim();
|
| 259 |
+
|
| 260 |
+
if (pageText) {
|
| 261 |
+
fullText += pageText + '\n\n';
|
| 262 |
}
|
| 263 |
}
|
| 264 |
|
| 265 |
+
// Strategy 2: Enhanced Turkish character decoding
|
| 266 |
+
fullText = decodeTurkishText(fullText);
|
| 267 |
+
|
| 268 |
+
// Strategy 3: If still poor quality, try OCR with preprocessing
|
| 269 |
+
if (!fullText.trim() || fullText.trim().length < 50) {
|
| 270 |
+
console.warn('Primary text extraction failed, attempting enhanced OCR...');
|
| 271 |
+
fullText = await enhancedOCRFallback(typedArray);
|
| 272 |
+
}
|
| 273 |
+
|
| 274 |
+
// Strategy 4: Apply text quality improvements
|
| 275 |
+
fullText = improveTextQuality(fullText);
|
| 276 |
+
|
| 277 |
resolve(fullText);
|
| 278 |
+
|
| 279 |
+
} catch (error) {
|
| 280 |
console.error('PDF extraction error:', error);
|
| 281 |
reject(new Error('Failed to extract text from PDF: ' + error.message));
|
| 282 |
}
|
|
|
|
| 286 |
reader.readAsArrayBuffer(file);
|
| 287 |
});
|
| 288 |
}
|
| 289 |
+
|
| 290 |
+
// Enhanced Turkish text decoding
|
| 291 |
+
/**
 * Repairs Turkish characters that were garbled by a wrong charset round-trip
 * (UTF-8 bytes that were decoded as Latin-1 / Windows-1252 / Windows-1254).
 *
 * Text that is already correctly encoded is returned unchanged: valid letters
 * such as "Ç", "İ" or "Ö" are never case-folded or substituted.
 *
 * @param {string} text - Extracted text that may contain mojibake.
 * @returns {string} The repaired text; an empty string for null/undefined.
 */
function decodeTurkishText(text) {
  if (text == null) {
    return '';
  }

  // Each Turkish letter is two bytes in UTF-8. The keys below are what those
  // two bytes look like after being mis-decoded; the second byte differs
  // between Latin-1 (C1 control codes) and the Windows code pages.
  const mojibake = new Map([
    ['\u00C3\u00A7', 'ç'],
    ['\u00C3\u0087', 'Ç'], ['\u00C3\u2021', 'Ç'],
    ['\u00C4\u009F', 'ğ'], ['\u00C4\u0178', 'ğ'],
    ['\u00C4\u009E', 'Ğ'], ['\u00C4\u017E', 'Ğ'],
    ['\u00C4\u00B1', 'ı'],
    ['\u00C4\u00B0', 'İ'],
    ['\u00C3\u00B6', 'ö'],
    ['\u00C3\u0096', 'Ö'], ['\u00C3\u2013', 'Ö'],
    ['\u00C5\u009F', 'ş'], ['\u00C5\u0178', 'ş'],
    ['\u00C5\u009E', 'Ş'], ['\u00C5\u017E', 'Ş'],
    ['\u00C3\u00BC', 'ü'],
    ['\u00C3\u009C', 'Ü'], ['\u00C3\u0153', 'Ü'],
  ]);

  // One pass over every candidate pair; pairs that are not in the table are
  // left exactly as they were.
  const candidatePair = /[\u00C3\u00C4\u00C5][\u0080-\u00BF\u0153\u0178\u017E\u2013\u2021]/g;
  let decodedText = String(text).replace(candidatePair, (pair) => (
    mojibake.has(pair) ? mojibake.get(pair) : pair
  ));

  // HTML entity decoding is best-effort: the `he` library is loaded from a
  // separate <script> tag and may be missing.
  if (typeof he !== 'undefined' && he && typeof he.decode === 'function') {
    try {
      decodedText = he.decode(decodedText);
    } catch (e) {
      console.warn('HTML decoding failed:', e);
    }
  }

  return decodedText;
}
|
| 335 |
|
| 336 |
+
// Enhanced OCR fallback with multiple engines
|
| 337 |
+
/**
 * OCR fallback for PDFs whose text layer is missing or unusable.
 *
 * Every rendered page is recognised three ways (Turkish+English, the same
 * with image preprocessing, and English only) and the longest non-blank
 * result is kept for that page.
 *
 * @param {Uint8Array|ArrayBuffer} pdfData - Raw PDF bytes.
 * @returns {Promise<string>} Page texts joined by blank lines, or a fixed
 *   placeholder sentence when no page produced any text.
 */
async function enhancedOCRFallback(pdfData) {
  const pageCanvases = await convertPDFToImagesEnhanced(pdfData);
  const pageTexts = [];

  for (const canvas of pageCanvases) {
    // The two plain attempts only read the canvas, so they can share it.
    const [turkishEnglish, englishOnly] = await Promise.allSettled([
      extractTextWithTesseract(canvas, 'tur+eng'),
      extractTextWithTesseract(canvas, 'eng'),
    ]);

    // Preprocessing may binarise the canvas in place, so it must only start
    // once the attempts that need the untouched image have finished.
    const [preprocessed] = await Promise.allSettled([
      extractTextWithTesseract(canvas, 'tur+eng', { preprocess: true }),
    ]);

    // Same priority on ties as before: tur+eng, preprocessed, eng.
    let bestText = '';
    for (const attempt of [turkishEnglish, preprocessed, englishOnly]) {
      if (attempt.status !== 'fulfilled' || typeof attempt.value !== 'string') {
        continue;
      }
      if (attempt.value.trim().length > bestText.trim().length) {
        bestText = attempt.value;
      }
    }

    if (bestText.trim()) {
      pageTexts.push(bestText);
    }
  }

  return pageTexts.join('\n\n') || 'OCR processing completed but no text was extracted.';
}
|
| 370 |
+
|
| 371 |
+
// Enhanced Tesseract extraction
|
| 372 |
+
/**
 * Runs Tesseract OCR on a single image and returns the recognised text.
 *
 * The options object of `Tesseract.recognize` only configures the worker, so
 * engine parameters put there are never applied. A dedicated worker is used
 * here so that they can be set through `setParameters`.
 *
 * @param {HTMLCanvasElement|HTMLImageElement|File|Blob|string} image - Image to read.
 * @param {string} [languages='tur+eng'] - Tesseract language code(s), `+`-separated.
 * @param {{preprocess?: boolean}} [options] - `preprocess: true` sends the image
 *   through `preprocessImage` first (requires a canvas).
 * @returns {Promise<string>} The recognised plain text.
 * @throws Re-throws any worker or recognition error after logging it.
 */
async function extractTextWithTesseract(image, languages = 'tur+eng', options = {}) {
  // `load_system_dawg` / `load_freq_dawg` are initialisation-time settings that
  // are already enabled by default, so they are not repeated here.
  const engineParameters = {
    preserve_interword_spaces: '1',
    tessedit_pageseg_mode: '6',
    tessedit_char_whitelist: 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,!?-(){}[]/\\\'" ğüşıöçĞÜŞİÖÇ@#$%^&*+=<>:;_ ',
  };

  let worker = null;
  try {
    const source = options.preprocess ? await preprocessImage(image) : image;

    worker = await Tesseract.createWorker({
      logger: (m) => console.log(`Tesseract: ${m.status} - ${Math.round(m.progress * 100)}%`),
    });
    await worker.loadLanguage(languages);
    await worker.initialize(languages);
    await worker.setParameters(engineParameters);

    const result = await worker.recognize(source);
    return result.data.text;
  } catch (error) {
    console.error('Tesseract OCR error:', error);
    throw error;
  } finally {
    // Always release the worker, even when recognition failed.
    if (worker) {
      await worker.terminate();
    }
  }
}
|
| 395 |
+
|
| 396 |
+
// Image preprocessing for better OCR
|
| 397 |
+
/**
 * Produces a black-and-white copy of a canvas to make OCR easier.
 *
 * Each pixel is converted to luminance and then binarised against a single
 * global threshold (this is not adaptive thresholding). The source canvas is
 * left untouched so other OCR attempts can still read the original image.
 *
 * @param {HTMLCanvasElement} canvas - Source image.
 * @param {number} [threshold=128] - Luminance above this value becomes white.
 * @returns {Promise<HTMLCanvasElement>} A new canvas holding the binarised image.
 */
async function preprocessImage(canvas, threshold = 128) {
  const { width, height } = canvas;

  // getImageData hands back a copy of the pixels, so editing it does not
  // affect the source canvas.
  const imageData = canvas.getContext('2d').getImageData(0, 0, width, height);
  const data = imageData.data;

  for (let i = 0; i < data.length; i += 4) {
    // Store the luminance first so it is rounded/clamped the same way the
    // pixel buffer would round it before the comparison.
    data[i] = data[i] * 0.299 + data[i + 1] * 0.587 + data[i + 2] * 0.114;
    const value = data[i] > threshold ? 255 : 0;
    data[i] = value;
    data[i + 1] = value;
    data[i + 2] = value;
    // Alpha (i + 3) is intentionally left as-is.
  }

  const output = document.createElement('canvas');
  output.width = width;
  output.height = height;
  output.getContext('2d').putImageData(imageData, 0, 0);
  return output;
}
|
| 422 |
+
|
| 423 |
+
// Enhanced PDF to image conversion
|
| 424 |
+
/**
 * Renders the leading pages of a PDF to high-resolution canvases for OCR.
 *
 * @param {Uint8Array|ArrayBuffer} pdfData - Raw PDF bytes.
 * @param {{maxPages?: number, scale?: number}} [options]
 *   `maxPages` (default 10) bounds how many pages are rendered, because each
 *   canvas at the default `scale` of 3.0 is large; `scale` is the render zoom.
 * @returns {Promise<HTMLCanvasElement[]>} One canvas per rendered page, in page order.
 */
async function convertPDFToImagesEnhanced(pdfData, options = {}) {
  const { maxPages = 10, scale = 3.0 } = options;

  const loadingTask = pdfjsLib.getDocument({
    // pdf.js may take ownership of the buffer it is given (it can be
    // transferred to the worker), so hand it a copy and keep the caller's intact.
    data: pdfData.slice(),
    cMapUrl: 'https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/cmaps/',
    cMapPacked: true,
    standardFontDataUrl: 'https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/standard_fonts/',
  });

  try {
    const pdf = await loadingTask.promise;
    const pageCount = Math.min(pdf.numPages, maxPages);
    if (pageCount < pdf.numPages) {
      console.warn(`Rendering only the first ${pageCount} of ${pdf.numPages} PDF pages for OCR.`);
    }

    const images = [];
    for (let pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
      const page = await pdf.getPage(pageNumber);
      const viewport = page.getViewport({ scale });

      const canvas = document.createElement('canvas');
      canvas.height = viewport.height;
      canvas.width = viewport.width;

      // No interactive-forms flag: for OCR the form widgets must be painted
      // onto the canvas, which is what the default annotation mode does.
      await page.render({
        canvasContext: canvas.getContext('2d'),
        viewport,
        intent: 'print',
      }).promise;

      page.cleanup();
      images.push(canvas);
    }

    return images;
  } finally {
    // Release the document and its worker; the rendered canvases stay valid.
    await loadingTask.destroy();
  }
}
|
| 459 |
+
|
| 460 |
+
// Text quality improvement
|
| 461 |
+
/**
 * Normalises the whitespace of extracted text without touching its characters.
 *
 * Runs of spaces/tabs collapse to one space, spaces around line breaks are
 * dropped, and three or more consecutive line breaks collapse to one blank
 * line, so paragraph structure survives.
 *
 * Letters and digits are deliberately left alone: blanket substitutions such
 * as "c before a vowel becomes ç" or "0 becomes O" rewrite correct text, and
 * which reading is right cannot be decided from a single character.
 *
 * @param {string} text - Extracted text.
 * @returns {string} Cleaned text; an empty string for null/undefined.
 */
function improveTextQuality(text) {
  if (text == null) {
    return '';
  }

  return String(text)
    .replace(/\r\n?/g, '\n') // unify line endings first
    .replace(/[^\S\n]+/g, ' ') // horizontal whitespace only; keep line breaks
    .replace(/ ?\n ?/g, '\n') // no stray spaces at line edges
    .replace(/\n{3,}/g, '\n\n') // at most one blank line between paragraphs
    .trim();
}
|
| 487 |
+
/**
 * Extracts the plain text of a Word (.docx) file with mammoth.
 *
 * The Markdown conversion is used only when raw extraction yields no text at
 * all. Short documents are legitimate, and Markdown output is always "longer"
 * because it is backslash-escaped and can embed images as data URIs.
 *
 * @param {File|Blob} file - The uploaded Word document.
 * @returns {Promise<string>} The extracted text (possibly empty).
 * @throws Rejects when the file cannot be read or mammoth fails.
 */
async function extractTextFromWord(file) {
  const arrayBuffer = await new Promise((resolve, reject) => {
    const reader = new FileReader();
    reader.onload = (event) => resolve(event.target.result);
    reader.onerror = () => reject(reader.error || new Error('Failed to read Word file'));
    reader.readAsArrayBuffer(file);
  });

  // Style maps only affect HTML/Markdown conversion, so raw text extraction
  // takes nothing but the input.
  const raw = await mammoth.extractRawText({ arrayBuffer });
  let text = improveTextQuality(decodeTurkishText(raw.value || ''));

  if (!text.trim() && typeof mammoth.convertToMarkdown === 'function') {
    console.warn('Primary Word extraction failed, trying alternative...');
    const alternative = await mammoth.convertToMarkdown({ arrayBuffer });

    // Drop embedded images so base64 payloads are not mistaken for text.
    const markdown = (alternative.value || '').replace(/!\[[^\]]*\]\([^)]*\)/g, '');
    if (markdown.trim()) {
      text = improveTextQuality(decodeTurkishText(markdown));
    }
  }

  return text;
}
|
| 541 |
+
/**
 * Reads every sheet of a spreadsheet file with SheetJS.
 *
 * Cell values are exact data, so only charset repair is applied to them;
 * OCR-oriented text clean-up is not run on spreadsheet content.
 *
 * @param {File|Blob} file - The uploaded spreadsheet.
 * @returns {Promise<Object<string, {data: Array<Array<*>>, csv: string,
 *   range: string, rowCount: number, colCount: number}>>} Per sheet name:
 *   the rows as arrays, a tab-separated dump, the used range, and the
 *   row/column counts (columns are taken from the first row).
 * @throws Rejects when the file cannot be read or parsed.
 */
async function extractTextFromExcel(file) {
  const arrayBuffer = await new Promise((resolve, reject) => {
    const reader = new FileReader();
    reader.onload = (event) => resolve(event.target.result);
    reader.onerror = () => reject(reader.error || new Error('Failed to read Excel file'));
    reader.readAsArrayBuffer(file);
  });

  const workbook = XLSX.read(new Uint8Array(arrayBuffer), {
    type: 'array',
    codepage: 1254, // Turkish code page for legacy files without encoding info
    cellStyles: true,
    cellHTML: false,
  });

  const result = {};

  for (const sheetName of workbook.SheetNames) {
    const worksheet = workbook.Sheets[sheetName];

    const rows = XLSX.utils.sheet_to_json(worksheet, {
      header: 1,
      raw: false,
      dateNF: 'dd/mm/yyyy',
      defval: '',
    });

    const csv = XLSX.utils.sheet_to_csv(worksheet, {
      FS: '\t',
      RS: '\n',
      dateNF: 'dd/mm/yyyy',
    });

    result[sheetName] = {
      data: rows.map((row) => row.map((cell) => (
        typeof cell === 'string' ? decodeTurkishText(cell) : cell
      ))),
      csv: decodeTurkishText(csv),
      range: worksheet['!ref'] || '',
      rowCount: rows.length,
      colCount: rows[0] ? rows[0].length : 0,
    };
  }

  return result;
}
|
| 605 |
+
/**
 * Alias kept under the older name; all the work is delegated to
 * `convertPDFToImagesEnhanced`.
 *
 * @param {Uint8Array|ArrayBuffer} pdfData - Raw PDF bytes.
 * @returns {Promise<HTMLCanvasElement[]>} The rendered page canvases.
 */
async function convertPDFToImages(pdfData) {
  const pageCanvases = await convertPDFToImagesEnhanced(pdfData);
  return pageCanvases;
}
|
| 608 |
+
async function extractTextFromImage(file) {
|
| 609 |
+
return new Promise(async (resolve, reject) => {
|
| 610 |
+
try {
|
| 611 |
+
// Apply learned corrections before OCR
|
| 612 |
+
let trainedWords = {};
|
| 613 |
+
if (window.ocrLearningDict) {
|
| 614 |
+
for (const [word, data] of Object.entries(window.ocrLearningDict)) {
|
| 615 |
+
if (data.confirmedCorrect && data.confirmedCorrect !== word) {
|
| 616 |
+
trainedWords[word] = data.confirmedCorrect;
|
| 617 |
+
}
|
| 618 |
+
}
|
| 619 |
+
}
|
| 620 |
+
|
| 621 |
+
const imageElement = file instanceof HTMLCanvasElement ? file : file;
|
| 622 |
+
|
| 623 |
+
// Enhanced OCR configuration
|
| 624 |
+
const config = {
|
| 625 |
+
logger: m => {
|
| 626 |
+
if (m.status === 'recognizing text') {
|
| 627 |
+
console.log(`OCR Progress: ${Math.round(m.progress * 100)}%`);
|
| 628 |
+
}
|
| 629 |
+
},
|
| 630 |
+
preserve_interword_spaces: '1',
|
| 631 |
+
tessedit_pageseg_mode: '6', // Assume uniform text block
|
| 632 |
+
tessedit_char_whitelist: 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,!?-(){}[]/\\\'" @#$%^&*+=<>:;_ ğüşıöçĞÜŞİÖÇ',
|
| 633 |
user_defined_words: Object.keys(trainedWords).join(' '),
|
| 634 |
+
tessedit_create_hocr: '1',
|
| 635 |
+
load_system_dawg: '1',
|
| 636 |
+
load_freq_dawg: '1',
|
|
|
|
| 637 |
user_words_suffix: 'tur',
|
| 638 |
+
user_patterns_suffix: 'tur',
|
| 639 |
+
tessedit_ocr_engine_mode: '1', // LSTM OCR engine
|
| 640 |
+
tessedit_do_ocr: '1',
|
| 641 |
+
tessedit_load_image: '1'
|
| 642 |
+
};
|
| 643 |
+
|
| 644 |
+
// Try multiple OCR approaches
|
| 645 |
+
const results = await Promise.allSettled([
|
| 646 |
+
// Primary: Turkish + English with enhanced preprocessing
|
| 647 |
+
performOCRWithPreprocessing(imageElement, 'tur+eng', config),
|
| 648 |
+
// Secondary: Different page segmentation
|
| 649 |
+
Tesseract.recognize(imageElement, 'tur+eng', {
|
| 650 |
+
...config,
|
| 651 |
+
tessedit_pageseg_mode: '1' // Automatic page segmentation
|
| 652 |
+
}),
|
| 653 |
+
// Tertiary: Only English if Turkish fails
|
| 654 |
+
Tesseract.recognize(imageElement, 'eng', config)
|
| 655 |
+
]);
|
| 656 |
+
|
| 657 |
+
// Find and return the best result
|
| 658 |
+
let bestResult = { text: '', confidence: 0 };
|
| 659 |
+
|
| 660 |
+
results.forEach(result => {
|
| 661 |
+
if (result.status === 'fulfilled') {
|
| 662 |
+
const text = result.value.text;
|
| 663 |
+
const confidence = calculateConfidence(text);
|
| 664 |
+
|
| 665 |
+
if (text.trim().length > bestResult.text.length ||
|
| 666 |
+
(text.trim().length === bestResult.text.length && confidence > bestResult.confidence)) {
|
| 667 |
+
bestResult = { text, confidence };
|
| 668 |
+
}
|
| 669 |
+
}
|
| 670 |
+
});
|
| 671 |
+
|
| 672 |
+
if (bestResult.text) {
|
| 673 |
+
// Apply text quality improvements
|
| 674 |
+
bestResult.text = decodeTurkishText(bestResult.text);
|
| 675 |
+
bestResult.text = improveTextQuality(bestResult.text);
|
| 676 |
+
|
| 677 |
+
if (outputFormat.value === 'formatted') {
|
| 678 |
+
// Create formatted output
|
| 679 |
+
const formatted = createFormattedText(bestResult.text);
|
| 680 |
+
resolve(formatted);
|
| 681 |
+
} else {
|
| 682 |
+
resolve(bestResult.text);
|
| 683 |
+
}
|
| 684 |
} else {
|
| 685 |
+
resolve('No text could be extracted from the image.');
|
| 686 |
}
|
| 687 |
+
|
| 688 |
+
} catch (error) {
|
| 689 |
+
console.error('Enhanced image OCR error:', error);
|
| 690 |
+
reject(error);
|
| 691 |
+
}
|
| 692 |
});
|
| 693 |
+
|
| 694 |
+
// OCR with image preprocessing
|
| 695 |
+
/**
 * Recognises an image with Tesseract, passing canvases through
 * `preprocessImage` first; any other image source is used as given.
 *
 * @param {*} image - Canvas or any other image source Tesseract accepts.
 * @param {string} languages - Tesseract language code(s).
 * @param {Object} config - Options handed to `Tesseract.recognize`.
 * @returns {Promise<*>} Whatever `Tesseract.recognize` resolves to.
 */
async function performOCRWithPreprocessing(image, languages, config) {
  const source = image instanceof HTMLCanvasElement
    ? await preprocessImage(image)
    : image;

  const recognition = await Tesseract.recognize(source, languages, config);
  return recognition;
}
|
| 705 |
+
|
| 706 |
+
// Calculate text confidence score
|
| 707 |
+
/**
 * Heuristic quality score for OCR output, in the range 0..1.
 *
 * Half of the score is the share of words containing a Turkish-specific
 * letter; the other half grows with the average sentence length in words and
 * saturates at 10 words per sentence.
 *
 * @param {string} text - OCR output to score.
 * @returns {number} Score between 0 and 1; 0 for blank or non-string input.
 */
function calculateConfidence(text) {
  if (typeof text !== 'string') {
    return 0;
  }

  // Matching tokens (instead of splitting) avoids empty entries from
  // leading or trailing whitespace.
  const words = text.match(/\S+/g);
  if (!words) {
    return 0;
  }

  // Count words, not character runs, so the ratio can never exceed 1.
  const turkishWordCount = words.filter((word) => /[ğüşıöçĞÜŞİÖÇ]/.test(word)).length;
  const turkishRatio = turkishWordCount / words.length;

  const sentences = text.split(/[.!?]+/).filter((s) => s.trim().length > 0);
  // Text made only of punctuation has no sentences; avoid dividing by zero.
  const avgSentenceLength = sentences.length === 0
    ? 0
    : sentences.reduce((sum, s) => sum + s.trim().split(/\s+/).length, 0) / sentences.length;

  return (turkishRatio * 0.5) + (Math.min(avgSentenceLength / 10, 1) * 0.5);
}
|
| 722 |
+
|
| 723 |
+
// Create formatted text output
|
| 724 |
+
/**
 * Lays OCR text out as paragraphs: every sentence end starts a new paragraph
 * and runs of blank lines collapse to a single blank line.
 *
 * @param {string} text - Recognised text.
 * @returns {string} The formatted text; an empty string for null/undefined.
 */
function createFormattedText(text) {
  if (text == null) {
    return '';
  }

  return String(text)
    .replace(/([.!?])\s+/g, '$1\n\n') // paragraph break after each sentence
    .replace(/\n{3,}/g, '\n\n') // remove excessive line breaks
    // Tidy spacing after capitalised words, but only spaces/tabs: matching
    // line breaks here would undo the paragraph structure built above.
    .replace(/([A-ZÇĞİÖŞÜ][a-zçğıöşü]+)[^\S\n]+/g, '$1 ')
    .trim();
}
|
| 731 |
+
function processFormattedOCR(hocr) {
|
| 732 |
// Apply learned corrections
|
| 733 |
if (window.ocrLearningDict) {
|
| 734 |
for (const [word, data] of Object.entries(window.ocrLearningDict)) {
|