ASDAD34 committed on
Commit
e010252
·
verified ·
1 Parent(s): 262d37a

olmadı içerikleri çıkarmada zayıf daha iyi araçları ve kütüphaneleri kullan.

Browse files
Files changed (2) hide show
  1. index.html +19 -4
  2. script.js +476 -121
index.html CHANGED
@@ -8,11 +8,26 @@
8
  <script src="https://cdn.tailwindcss.com"></script>
9
  <script src="https://cdn.jsdelivr.net/npm/feather-icons/dist/feather.min.js"></script>
10
  <script src="https://unpkg.com/feather-icons"></script>
11
- <script src="https://cdnjs.cloudflare.com/ajax/libs/pdf.js/2.11.338/pdf.min.js"></script>
12
- <script src="https://cdnjs.cloudflare.com/ajax/libs/mammoth/1.4.0/mammoth.browser.min.js"></script>
 
 
 
 
13
  <script src="https://cdnjs.cloudflare.com/ajax/libs/xlsx/0.18.5/xlsx.full.min.js"></script>
14
- <script src="https://cdnjs.cloudflare.com/ajax/libs/tesseract.js/4.1.1/tesseract.min.js"></script>
15
- <script src="script.js"></script>
 
 
 
 
 
 
 
 
 
 
 
16
  </head>
17
  <body class="bg-gray-50 min-h-screen">
18
  <custom-navbar></custom-navbar>
 
8
  <script src="https://cdn.tailwindcss.com"></script>
9
  <script src="https://cdn.jsdelivr.net/npm/feather-icons/dist/feather.min.js"></script>
10
  <script src="https://unpkg.com/feather-icons"></script>
11
+ <!-- Enhanced PDF Processing -->
12
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/pdf.js/3.11.174/pdf.min.js"></script>
13
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/pdf-lib/1.17.1/pdf-lib.min.js"></script>
14
+
15
+ <!-- Enhanced Document Processing -->
16
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/mammoth/1.6.0/mammoth.browser.min.js"></script>
17
  <script src="https://cdnjs.cloudflare.com/ajax/libs/xlsx/0.18.5/xlsx.full.min.js"></script>
18
+
19
+ <!-- Enhanced OCR with Multiple Languages -->
20
+ <script src="https://cdn.jsdelivr.net/npm/tesseract.js@4/dist/tesseract.min.js"></script>
21
+ <script src="https://unpkg.com/[email protected]/dist/ocr-space-api.min.js"></script>
22
+
23
+ <!-- Image Processing -->
24
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/cropperjs/1.6.1/cropper.min.js"></script>
25
+ <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/cropperjs/1.6.1/cropper.min.css">
26
+
27
+ <!-- Additional Language Support -->
28
+ <script src="https://cdn.jsdelivr.net/npm/he@1.2.0/he.js"></script>
29
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/iconv-lite/0.6.3/iconv-lite.min.js"></script>
30
+ <script src="script.js"></script>
31
  </head>
32
  <body class="bg-gray-50 min-h-screen">
33
  <custom-navbar></custom-navbar>
script.js CHANGED
@@ -11,11 +11,12 @@ document.addEventListener('DOMContentLoaded', function() {
11
 
12
  let files = [];
13
  let processedResults = [];
 
 
14
 
15
- // Set PDF.js worker path
16
- pdfjsLib.GlobalWorkerOptions.workerSrc = 'https://cdnjs.cloudflare.com/ajax/libs/pdf.js/2.11.338/pdf.worker.min.js';
17
-
18
- // Handle file selection
19
  uploadBtn.addEventListener('click', () => fileInput.click());
20
 
21
  fileInput.addEventListener('change', handleFileSelection);
@@ -182,73 +183,100 @@ return {
182
  };
183
  }
184
  async function extractTextFromPDF(file) {
185
- return new Promise((resolve, reject) => {
186
  const reader = new FileReader();
187
 
188
  reader.onload = async function(event) {
189
  try {
190
  const typedArray = new Uint8Array(event.target.result);
 
 
191
  const loadingTask = pdfjsLib.getDocument({
192
  data: typedArray,
193
- cMapUrl: 'https://cdn.jsdelivr.net/npm/pdfjs-dist@2.11.338/cmaps/',
194
  cMapPacked: true,
195
- standardFontDataUrl: 'https://cdn.jsdelivr.net/npm/pdfjs-dist@2.11.338/standard_fonts/'
 
 
 
 
 
196
  });
197
 
198
  const pdf = await loadingTask.promise;
199
  let fullText = '';
 
200
 
 
201
  for (let i = 1; i <= pdf.numPages; i++) {
202
  const page = await pdf.getPage(i);
 
 
 
 
 
203
  const textContent = await page.getTextContent({
204
  normalizeWhitespace: false,
205
- disableCombineTextItems: false
 
206
  });
207
 
208
- const pageText = textContent.items
209
- .map(item => item.str)
210
- .join(' ');
 
 
211
 
212
- if (pageText.trim()) {
213
- fullText += pageText + '\n\n';
214
- }
215
- }
216
-
217
- // Try to fix common Turkish character encoding issues
218
- fullText = fullText
219
- .replace(/\u00C3\u00A7/g, 'ç') // ç
220
- .replace(/\u00C3\u0087/g, 'Ç') // Ç
221
- .replace(/\u00C3\u011F/g, 'ğ') // ğ
222
- .replace(/\u00C4\u0178/g, 'Ğ') // Ğ
223
- .replace(/\u00C3\u00BC/g, 'ü') // ü
224
- .replace(/\u00C3\u009C/g, 'Ü') // Ü
225
- .replace(/\u00C3\u015F/g, 'ş') // ş
226
- .replace(/\u00C5\u0178/g, 'Ş') // Ş
227
- .replace(/\u00C3\u0131/g, 'ı') // ı
228
- .replace(/\u00C4\u0131/g, 'İ') // İ
229
- .replace(/\u00C3\u00B6/g, 'ö') // ö
230
- .replace(/\u00C3\u0096/g, 'Ö'); // Ö
231
- if (!fullText.trim()) {
232
- console.warn('PDF text extraction returned empty content. Attempting OCR processing...');
233
- try {
234
- // Convert PDF to image for OCR processing
235
- const images = await convertPDFToImages(typedArray);
236
- let ocrText = '';
237
 
238
- for (const image of images) {
239
- const text = await extractTextFromImage(image);
240
- ocrText += text + '\n\n';
241
  }
242
 
243
- fullText = ocrText || 'OCR processing attempted but no text was found.';
244
- } catch (ocrError) {
245
- console.error('OCR processing failed:', ocrError);
246
- fullText = 'Warning: No extractable text found. OCR processing also failed: ' + ocrError.message;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
247
  }
248
  }
249
 
 
 
 
 
 
 
 
 
 
 
 
 
250
  resolve(fullText);
251
- } catch (error) {
 
252
  console.error('PDF extraction error:', error);
253
  reject(new Error('Failed to extract text from PDF: ' + error.message));
254
  }
@@ -258,122 +286,449 @@ return {
258
  reader.readAsArrayBuffer(file);
259
  });
260
  }
261
- async function extractTextFromWord(file) {
262
- return new Promise((resolve, reject) => {
263
- const reader = new FileReader();
 
 
 
 
 
 
 
 
 
264
 
265
- reader.onload = function(event) {
266
- mammoth.extractRawText({ arrayBuffer: event.target.result })
267
- .then(function(result) {
268
- resolve(result.value);
269
- })
270
- .catch(reject);
271
- };
272
 
273
- reader.onerror = reject;
274
- reader.readAsArrayBuffer(file);
 
 
 
 
 
 
 
 
 
 
 
 
275
  });
 
 
 
 
 
 
 
 
 
276
  }
277
 
278
- async function extractTextFromExcel(file) {
279
- return new Promise((resolve, reject) => {
280
- const reader = new FileReader();
 
 
 
 
 
 
 
 
 
 
 
 
281
 
282
- reader.onload = function(event) {
283
- try {
284
- const data = new Uint8Array(event.target.result);
285
- const workbook = XLSX.read(data, { type: 'array' });
286
- const result = {};
287
-
288
- workbook.SheetNames.forEach(sheetName => {
289
- const worksheet = workbook.Sheets[sheetName];
290
- result[sheetName] = XLSX.utils.sheet_to_json(worksheet, { header: 1 });
291
- });
292
-
293
- resolve(result);
294
- } catch (error) {
295
- reject(error);
296
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
297
  };
298
 
299
- reader.onerror = reject;
300
- reader.readAsArrayBuffer(file);
301
- });
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
302
  }
303
- async function convertPDFToImages(pdfData) {
304
- return new Promise(async (resolve) => {
 
305
  const loadingTask = pdfjsLib.getDocument({
306
  data: pdfData,
307
- cMapUrl: 'https://cdn.jsdelivr.net/npm/pdfjs-dist@2.11.338/cmaps/',
308
  cMapPacked: true,
309
- standardFontDataUrl: 'https://cdn.jsdelivr.net/npm/pdfjs-dist@2.11.338/standard_fonts/'
310
  });
311
 
312
  const pdf = await loadingTask.promise;
313
  const images = [];
314
 
315
- for (let i = 1; i <= Math.min(pdf.numPages, 5); i++) { // Limit to 5 pages
 
316
  const page = await pdf.getPage(i);
317
- const viewport = page.getViewport({ scale: 1.5 });
 
318
  const canvas = document.createElement('canvas');
319
  const context = canvas.getContext('2d');
320
 
321
  canvas.height = viewport.height;
322
  canvas.width = viewport.width;
323
 
 
324
  await page.render({
325
  canvasContext: context,
326
- viewport: viewport
 
 
327
  }).promise;
328
 
329
  images.push(canvas);
330
  }
331
 
332
- resolve(images);
333
- });
334
- }
335
- async function extractTextFromImage(file) {
336
- return new Promise((resolve, reject) => {
337
- // Apply learned corrections before OCR if any exist
338
- let trainedWords = {};
339
- if (window.ocrLearningDict) {
340
- for (const [word, data] of Object.entries(window.ocrLearningDict)) {
341
- if (data.confirmedCorrect && data.confirmedCorrect !== word) {
342
- trainedWords[word] = data.confirmedCorrect;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
343
  }
344
- }
345
- }
346
- const imageElement = file instanceof HTMLCanvasElement ?
347
- file :
348
- file;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
349
 
350
- Tesseract.recognize(
351
- imageElement,
352
- 'tur+eng', // Turkish + English languages
353
- {
354
- logger: m => console.log(m),
355
- preserve_interword_spaces: true,
356
- tessedit_pageseg_mode: 6, // Assume a single uniform block of text
357
- tessedit_char_whitelist: 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,!?-(){}[]/\\\'" ğüşıöçĞÜŞİÖÇ', // Added Turkish chars
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
358
  user_defined_words: Object.keys(trainedWords).join(' '),
359
- user_words: Object.values(trainedWords).join(' '),
360
- tessedit_create_hocr: 1, // Include formatting info
361
- load_system_dawg: 1,
362
- load_freq_dawg: 1,
363
  user_words_suffix: 'tur',
364
- user_patterns_suffix: 'tur'
365
- }
366
- ).then(({ data: { text, hocr } }) => {
367
- if (outputFormat.value === 'formatted') {
368
- // Process formatted output similar to Adobe/Abbyy
369
- const formatted = processFormattedOCR(hocr);
370
- resolve(formatted);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
371
  } else {
372
- resolve(text);
373
  }
374
- }).catch(reject);
 
 
 
 
375
  });
376
- function processFormattedOCR(hocr) {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
377
  // Apply learned corrections
378
  if (window.ocrLearningDict) {
379
  for (const [word, data] of Object.entries(window.ocrLearningDict)) {
 
11
 
12
  let files = [];
13
  let processedResults = [];
14
+ // Set enhanced PDF.js worker path with additional configurations
15
+ pdfjsLib.GlobalWorkerOptions.workerSrc = 'https://cdnjs.cloudflare.com/ajax/libs/pdf.js/3.11.174/pdf.worker.min.js';
16
 
17
+ // Configure PDF.js for better text extraction
18
+ pdfjsLib.GlobalWorkerOptions.isEvalSupported = false;
19
+ // Handle file selection
 
20
  uploadBtn.addEventListener('click', () => fileInput.click());
21
 
22
  fileInput.addEventListener('change', handleFileSelection);
 
183
  };
184
  }
185
  async function extractTextFromPDF(file) {
186
+ return new Promise(async (resolve, reject) => {
187
  const reader = new FileReader();
188
 
189
  reader.onload = async function(event) {
190
  try {
191
  const typedArray = new Uint8Array(event.target.result);
192
+
193
+ // Enhanced PDF loading with multiple extraction strategies
194
  const loadingTask = pdfjsLib.getDocument({
195
  data: typedArray,
196
+ cMapUrl: 'https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/cmaps/',
197
  cMapPacked: true,
198
+ standardFontDataUrl: 'https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/standard_fonts/',
199
+ useSystemFonts: true,
200
+ useWorkerFetch: true,
201
+ isEvalSupported: false,
202
+ disableAutoFetch: false,
203
+ disableStream: false
204
  });
205
 
206
  const pdf = await loadingTask.promise;
207
  let fullText = '';
208
+ let metadata = await pdf.getMetadata();
209
 
210
+ // Strategy 1: Enhanced text extraction with structural analysis
211
  for (let i = 1; i <= pdf.numPages; i++) {
212
  const page = await pdf.getPage(i);
213
+
214
+ // Get viewport for better text positioning
215
+ const viewport = page.getViewport({ scale: 2.0 });
216
+
217
+ // Enhanced text content extraction
218
  const textContent = await page.getTextContent({
219
  normalizeWhitespace: false,
220
+ disableCombineTextItems: false,
221
+ includeMarkedContent: true
222
  });
223
 
224
+ // Process text items with better grouping
225
+ const textItems = textContent.items;
226
+ let pageText = '';
227
+ let lastY = null;
228
+ let lastX = null;
229
 
230
+ for (let j = 0; j < textItems.length; j++) {
231
+ const item = textItems[j];
232
+ const tx = pdfjsLib.Util.transform(
233
+ viewport.transform,
234
+ item.transform
235
+ );
236
+ const x = tx[4];
237
+ const y = tx[5];
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
238
 
239
+ // Add line breaks based on Y position
240
+ if (lastY !== null && Math.abs(y - lastY) > item.height * 0.8) {
241
+ pageText += '\n';
242
  }
243
 
244
+ // Add spaces based on X position
245
+ if (lastX !== null && x - lastX > item.width * 0.3) {
246
+ pageText += ' ';
247
+ }
248
+
249
+ pageText += item.str;
250
+ lastY = y;
251
+ lastX = x + item.width;
252
+ }
253
+
254
+ // Clean up and format the text
255
+ pageText = pageText
256
+ .replace(/\s+/g, ' ')
257
+ .replace(/\n\s*\n/g, '\n\n')
258
+ .trim();
259
+
260
+ if (pageText) {
261
+ fullText += pageText + '\n\n';
262
  }
263
  }
264
 
265
+ // Strategy 2: Enhanced Turkish character decoding
266
+ fullText = decodeTurkishText(fullText);
267
+
268
+ // Strategy 3: If still poor quality, try OCR with preprocessing
269
+ if (!fullText.trim() || fullText.trim().length < 50) {
270
+ console.warn('Primary text extraction failed, attempting enhanced OCR...');
271
+ fullText = await enhancedOCRFallback(typedArray);
272
+ }
273
+
274
+ // Strategy 4: Apply text quality improvements
275
+ fullText = improveTextQuality(fullText);
276
+
277
  resolve(fullText);
278
+
279
+ } catch (error) {
280
  console.error('PDF extraction error:', error);
281
  reject(new Error('Failed to extract text from PDF: ' + error.message));
282
  }
 
286
  reader.readAsArrayBuffer(file);
287
  });
288
  }
289
+
290
/**
 * Repairs Turkish letters that were stored as UTF-8 but decoded one byte at a
 * time (classic "Ã§" for "ç" mojibake), then decodes HTML entities when the
 * `he` library is available.
 *
 * Correctly encoded text passes through unchanged. Earlier revisions also ran
 * an "ISO-8859-9" table that lower-cased valid capitals (Ç -> ç, Ö -> ö,
 * İ -> ı) and carried a list of single-letter "OCR" substitutions that was
 * never applied; both are gone because neither can be done safely without
 * knowing the intended word.
 *
 * @param {string} text - Text to repair.
 * @returns {string} The repaired text; '' for empty or non-string input.
 */
function decodeTurkishText(text) {
  if (typeof text !== 'string' || text.length === 0) {
    return '';
  }

  // Two-character garbage sequence -> intended letter. A second byte in the
  // 0x80-0x9F range shows up either as a raw C1 control character (Latin-1
  // decoding) or as Windows-1252 punctuation, so both spellings are listed.
  const mojibake = {
    '\u00C3\u00A7': 'ç',
    '\u00C3\u0087': 'Ç', '\u00C3\u2021': 'Ç',
    '\u00C4\u009F': 'ğ', '\u00C4\u0178': 'ğ',
    '\u00C4\u009E': 'Ğ', '\u00C4\u017E': 'Ğ',
    '\u00C4\u00B1': 'ı',
    '\u00C4\u00B0': 'İ',
    '\u00C3\u00B6': 'ö',
    '\u00C3\u0096': 'Ö', '\u00C3\u2013': 'Ö',
    '\u00C5\u009F': 'ş', '\u00C5\u0178': 'ş',
    '\u00C5\u009E': 'Ş', '\u00C5\u017E': 'Ş',
    '\u00C3\u00BC': 'ü',
    '\u00C3\u009C': 'Ü', '\u00C3\u0153': 'Ü',
  };

  // Candidate pairs: a UTF-8 lead byte (Ã, Ä, Å) followed by anything that a
  // continuation byte can be mis-decoded as. Unknown pairs are left alone.
  let decodedText = text.replace(
    /[\u00C3-\u00C5][\u0080-\u00BF\u0153\u0178\u017E\u2013\u2021]/g,
    (pair) => mojibake[pair] ?? pair
  );

  // `he` comes from a separate <script> tag; skip entity decoding when it did
  // not load instead of logging a ReferenceError on every call.
  if (typeof he !== 'undefined' && typeof he.decode === 'function') {
    try {
      decodedText = he.decode(decodedText);
    } catch (e) {
      console.warn('HTML decoding failed:', e);
    }
  }

  return decodedText;
}
335
 
336
/**
 * OCR fallback for PDFs: renders the pages to canvases, runs three Tesseract
 * attempts per page and keeps the longest fulfilled transcript.
 *
 * @param {Uint8Array} pdfData - Raw PDF bytes handed to the page renderer.
 * @returns {Promise<string>} Page transcripts joined by blank lines, or a
 *   placeholder sentence when no attempt produced any text.
 */
async function enhancedOCRFallback(pdfData) {
  const pageCanvases = await convertPDFToImagesEnhanced(pdfData);
  const transcripts = [];

  for (const pageCanvas of pageCanvases) {
    // Attempts: Turkish+English, the same with preprocessing, English only.
    const attempts = await Promise.allSettled([
      extractTextWithTesseract(pageCanvas, 'tur+eng'),
      extractTextWithTesseract(pageCanvas, 'tur+eng', { preprocess: true }),
      extractTextWithTesseract(pageCanvas, 'eng')
    ]);

    // Strictly-longer wins, so the earliest attempt is kept on ties.
    const longest = attempts.reduce(
      (best, attempt) =>
        attempt.status === 'fulfilled' && attempt.value.length > best.length
          ? attempt.value
          : best,
      ''
    );

    if (longest) {
      transcripts.push(longest);
    }
  }

  const combined = transcripts.join('\n\n');
  return combined || 'OCR processing completed but no text was extracted.';
}
370
+
371
/**
 * Runs Tesseract OCR on one image and returns the recognized plain text.
 *
 * `Tesseract.recognize(image, langs, options)` passes `options` to
 * `createWorker`, so engine parameters placed there (page segmentation mode,
 * whitelist, ...) are never applied. This version drives a worker explicitly
 * so the parameters reach the engine via `setParameters`, and always
 * terminates the worker, even when recognition fails.
 *
 * @param {HTMLCanvasElement|string|Blob} image - Image source accepted by Tesseract.js.
 * @param {string} [languages='tur+eng'] - '+'-joined Tesseract language codes.
 * @param {{preprocess?: boolean}} [options] - When `preprocess` is truthy the
 *   image is first passed through `preprocessImage`.
 * @returns {Promise<string>} The recognized text.
 * @throws Re-throws any preprocessing/worker/recognition error after logging it.
 */
async function extractTextWithTesseract(image, languages = 'tur+eng', options = {}) {
  let worker;
  try {
    const source = options.preprocess ? await preprocessImage(image) : image;

    worker = await Tesseract.createWorker({
      logger: m => console.log(`Tesseract: ${m.status} - ${Math.round(m.progress * 100)}%`)
    });
    await worker.loadLanguage(languages);
    await worker.initialize(languages);

    // load_system_dawg / load_freq_dawg are init-time settings that Tesseract
    // already enables by default, so they are not repeated here.
    await worker.setParameters({
      preserve_interword_spaces: '1',
      tessedit_pageseg_mode: '6', // single uniform block of text
      tessedit_char_whitelist: 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,!?-(){}[]/\\\'" ğüşıöçĞÜŞİÖÇ@#$%^&*+=<>:;_ '
    });

    const result = await worker.recognize(source);
    return result.data.text;
  } catch (error) {
    console.error('Tesseract OCR error:', error);
    throw error;
  } finally {
    // Workers hold a WASM instance and language data; release them every time.
    await worker?.terminate();
  }
}
395
+
396
/**
 * Produces a black-and-white copy of a canvas for OCR: each pixel is reduced
 * to its luma (0.299 R + 0.587 G + 0.114 B) and then set to pure white when
 * the luma is above a fixed threshold of 128, otherwise pure black. Alpha is
 * left as it was.
 *
 * The work is done on a new canvas. The previous version overwrote the
 * caller's canvas, so every other OCR attempt sharing that canvas silently
 * received the thresholded image instead of the original.
 *
 * @param {HTMLCanvasElement} canvas - Source image; not modified.
 * @returns {Promise<HTMLCanvasElement>} A new canvas holding the binarized image.
 */
async function preprocessImage(canvas) {
  const THRESHOLD = 128;

  const output = document.createElement('canvas');
  output.width = canvas.width;
  output.height = canvas.height;

  const ctx = output.getContext('2d');
  ctx.drawImage(canvas, 0, 0);

  const imageData = ctx.getImageData(0, 0, output.width, output.height);
  const data = imageData.data;

  for (let i = 0; i < data.length; i += 4) {
    // Store the luma first so it is rounded/clamped by the pixel array exactly
    // as a separate grayscale pass would do, then threshold the stored value.
    data[i] = data[i] * 0.299 + data[i + 1] * 0.587 + data[i + 2] * 0.114;
    const value = data[i] > THRESHOLD ? 255 : 0;
    data[i] = value;
    data[i + 1] = value;
    data[i + 2] = value;
  }

  ctx.putImageData(imageData, 0, 0);
  return output;
}
422
+
423
/**
 * Renders the first pages of a PDF (at most 10) to canvases at 3x scale so
 * they can be fed to OCR.
 *
 * Changes from the previous version: the PDF.js loading task is destroyed
 * once rendering is finished (it was leaked before, keeping the parsed
 * document alive in the worker), and the `renderInteractiveForms` render
 * option was dropped because PDF.js 3.x, which this page loads, no longer
 * defines it.
 *
 * @param {Uint8Array} pdfData - Raw PDF bytes.
 * @returns {Promise<HTMLCanvasElement[]>} One canvas per rendered page, in page order.
 */
async function convertPDFToImagesEnhanced(pdfData) {
  const MAX_PAGES = 10;
  const RENDER_SCALE = 3.0;

  const loadingTask = pdfjsLib.getDocument({
    data: pdfData,
    cMapUrl: 'https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/cmaps/',
    cMapPacked: true,
    standardFontDataUrl: 'https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/standard_fonts/'
  });

  try {
    const pdf = await loadingTask.promise;
    const pageCount = Math.min(pdf.numPages, MAX_PAGES);
    const images = [];

    // Pages are rendered one after another; each is a large bitmap at 3x scale.
    for (let pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
      const page = await pdf.getPage(pageNumber);
      const viewport = page.getViewport({ scale: RENDER_SCALE });

      const canvas = document.createElement('canvas');
      const context = canvas.getContext('2d');
      canvas.height = viewport.height;
      canvas.width = viewport.width;

      await page.render({
        canvasContext: context,
        viewport: viewport,
        intent: 'print'
      }).promise;

      images.push(canvas);
    }

    return images;
  } finally {
    // The canvases already hold the pixels, so the document can be released.
    await loadingTask.destroy();
  }
}
459
+
460
/**
 * Light, non-destructive clean-up of extracted text.
 *
 * - Normalizes line endings, collapses runs of spaces/tabs, strips spaces
 *   around line breaks and limits blank lines to one (paragraph breaks are
 *   kept).
 * - Repairs the digit/letter confusions 0->O, 1->I, 5->S, but only when the
 *   digit sits between two uppercase letters (e.g. "B0LU" -> "BOLU").
 *
 * The previous version rewrote every c/g/s/o/u that preceded a vowel into
 * ç/ğ/ş/ö/ü, every standalone i/I into ı/İ, and — because
 * String.prototype.replace ignores a third argument — every 0, 1 and 5 into
 * O, I and S. Those rules corrupted correct text and numbers, so they are
 * gone. It also flattened all line breaks before the rule meant to keep
 * paragraphs could run.
 *
 * @param {string} text - Text to clean.
 * @returns {string} The cleaned text; '' for empty or non-string input.
 */
function improveTextQuality(text) {
  if (typeof text !== 'string' || text.length === 0) {
    return '';
  }

  const digitToLetter = { 0: 'O', 1: 'I', 5: 'S' };

  return text
    // A digit wedged between capitals is almost certainly a misread letter.
    // The trailing letter is a lookahead so chains like "A0B1C" are handled.
    .replace(
      /([A-ZÇĞİÖŞÜ])([015])(?=[A-ZÇĞİÖŞÜ])/g,
      (match, lead, digit) => lead + digitToLetter[digit]
    )
    .replace(/\r\n?/g, '\n')
    .replace(/[^\S\n]+/g, ' ') // horizontal whitespace only; keep newlines
    .replace(/ ?\n ?/g, '\n')
    .replace(/\n{3,}/g, '\n\n')
    .trim();
}
487
/**
 * Extracts plain text from a Word (.docx) file with mammoth, then runs the
 * result through `decodeTurkishText` and `improveTextQuality`. When fewer than
 * 50 characters come back, a Markdown conversion is tried and used if it
 * yields more text.
 *
 * Changes from the previous version: the Promise no longer uses an async
 * executor, and the `options` object that was nested inside mammoth's input
 * argument has been removed — mammoth takes options as a second argument, and
 * style maps have no effect on raw-text extraction anyway.
 *
 * @param {Blob} file - The .docx file to read.
 * @returns {Promise<string>} The extracted text.
 * @throws Rejects with the FileReader error event or with mammoth's error.
 */
async function extractTextFromWord(file) {
  // FileReader is callback-based; this is the only place a manual Promise is needed.
  const arrayBuffer = await new Promise((resolve, reject) => {
    const reader = new FileReader();
    reader.onload = (event) => resolve(event.target.result);
    reader.onerror = reject;
    reader.readAsArrayBuffer(file);
  });

  const cleanUp = (raw) => improveTextQuality(decodeTurkishText(raw));

  const primary = await mammoth.extractRawText({ arrayBuffer });
  let text = cleanUp(primary.value);

  // Very short output usually means the body lives in structures that raw
  // extraction skips; give the Markdown converter a chance.
  if (text.trim().length < 50) {
    console.warn('Primary Word extraction failed, trying alternative...');
    const alternative = await mammoth.convertToMarkdown({ arrayBuffer });

    if (alternative.value && alternative.value.trim().length > text.trim().length) {
      text = cleanUp(alternative.value);
    }
  }

  return text;
}
541
/**
 * Reads a spreadsheet with SheetJS and returns, per sheet name, an object
 * with:
 *   - data:     rows as arrays of cells (strings passed through
 *               `improveTextQuality` then `decodeTurkishText`),
 *   - csv:      the sheet as tab-separated text (through `decodeTurkishText`),
 *   - range:    the sheet's `!ref` value or '',
 *   - rowCount: number of rows returned by `sheet_to_json`,
 *   - colCount: length of the first row, or 0 when there are no rows.
 *
 * The Promise executor is no longer `async`: with an async executor, anything
 * thrown before the reader callbacks are attached becomes an unhandled
 * rejection and the returned promise never settles.
 *
 * @param {Blob} file - The spreadsheet file to read.
 * @returns {Promise<Object<string, {data: Array[], csv: string, range: string, rowCount: number, colCount: number}>>}
 * @throws Rejects with the FileReader error event or with the parsing error.
 */
async function extractTextFromExcel(file) {
  // Builds the summary object for a single worksheet.
  const describeSheet = (worksheet) => {
    const rows = XLSX.utils.sheet_to_json(worksheet, {
      header: 1,
      raw: false,
      dateNF: 'dd/mm/yyyy',
      defval: ''
    });

    const tabSeparated = XLSX.utils.sheet_to_csv(worksheet, {
      FS: '\t',
      RS: '\n',
      dateNF: 'dd/mm/yyyy'
    });

    const cleanedRows = rows.map(row =>
      row.map(cell =>
        typeof cell === 'string' ? decodeTurkishText(improveTextQuality(cell)) : cell
      )
    );

    return {
      data: cleanedRows,
      csv: decodeTurkishText(tabSeparated),
      range: worksheet['!ref'] || '',
      rowCount: rows.length,
      colCount: rows[0] ? rows[0].length : 0
    };
  };

  return new Promise((resolve, reject) => {
    const reader = new FileReader();

    reader.onload = (event) => {
      try {
        const bytes = new Uint8Array(event.target.result);

        const workbook = XLSX.read(bytes, {
          type: 'array',
          codepage: 1254, // Turkish codepage
          cellStyles: true,
          cellHTML: false
        });

        const sheets = {};
        for (const sheetName of workbook.SheetNames) {
          sheets[sheetName] = describeSheet(workbook.Sheets[sheetName]);
        }

        resolve(sheets);
      } catch (error) {
        reject(error);
      }
    };

    reader.onerror = reject;
    reader.readAsArrayBuffer(file);
  });
}
605
/**
 * Legacy entry point kept for existing callers; delegates unchanged to
 * `convertPDFToImagesEnhanced`.
 *
 * @param {Uint8Array} pdfData - Raw PDF bytes.
 * @returns {Promise<HTMLCanvasElement[]>} Whatever the enhanced converter resolves to.
 */
async function convertPDFToImages(pdfData) {
  const pageCanvases = await convertPDFToImagesEnhanced(pdfData);
  return pageCanvases;
}
608
+ async function extractTextFromImage(file) {
609
+ return new Promise(async (resolve, reject) => {
610
+ try {
611
+ // Apply learned corrections before OCR
612
+ let trainedWords = {};
613
+ if (window.ocrLearningDict) {
614
+ for (const [word, data] of Object.entries(window.ocrLearningDict)) {
615
+ if (data.confirmedCorrect && data.confirmedCorrect !== word) {
616
+ trainedWords[word] = data.confirmedCorrect;
617
+ }
618
+ }
619
+ }
620
+
621
+ const imageElement = file instanceof HTMLCanvasElement ? file : file;
622
+
623
+ // Enhanced OCR configuration
624
+ const config = {
625
+ logger: m => {
626
+ if (m.status === 'recognizing text') {
627
+ console.log(`OCR Progress: ${Math.round(m.progress * 100)}%`);
628
+ }
629
+ },
630
+ preserve_interword_spaces: '1',
631
+ tessedit_pageseg_mode: '6', // Assume uniform text block
632
+ tessedit_char_whitelist: 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,!?-(){}[]/\\\'" @#$%^&*+=<>:;_ ğüşıöçĞÜŞİÖÇ',
633
  user_defined_words: Object.keys(trainedWords).join(' '),
634
+ tessedit_create_hocr: '1',
635
+ load_system_dawg: '1',
636
+ load_freq_dawg: '1',
 
637
  user_words_suffix: 'tur',
638
+ user_patterns_suffix: 'tur',
639
+ tessedit_ocr_engine_mode: '1', // LSTM OCR engine
640
+ tessedit_do_ocr: '1',
641
+ tessedit_load_image: '1'
642
+ };
643
+
644
+ // Try multiple OCR approaches
645
+ const results = await Promise.allSettled([
646
+ // Primary: Turkish + English with enhanced preprocessing
647
+ performOCRWithPreprocessing(imageElement, 'tur+eng', config),
648
+ // Secondary: Different page segmentation
649
+ Tesseract.recognize(imageElement, 'tur+eng', {
650
+ ...config,
651
+ tessedit_pageseg_mode: '1' // Automatic page segmentation
652
+ }),
653
+ // Tertiary: Only English if Turkish fails
654
+ Tesseract.recognize(imageElement, 'eng', config)
655
+ ]);
656
+
657
+ // Find and return the best result
658
+ let bestResult = { text: '', confidence: 0 };
659
+
660
+ results.forEach(result => {
661
+ if (result.status === 'fulfilled') {
662
+ const text = result.value.text;
663
+ const confidence = calculateConfidence(text);
664
+
665
+ if (text.trim().length > bestResult.text.length ||
666
+ (text.trim().length === bestResult.text.length && confidence > bestResult.confidence)) {
667
+ bestResult = { text, confidence };
668
+ }
669
+ }
670
+ });
671
+
672
+ if (bestResult.text) {
673
+ // Apply text quality improvements
674
+ bestResult.text = decodeTurkishText(bestResult.text);
675
+ bestResult.text = improveTextQuality(bestResult.text);
676
+
677
+ if (outputFormat.value === 'formatted') {
678
+ // Create formatted output
679
+ const formatted = createFormattedText(bestResult.text);
680
+ resolve(formatted);
681
+ } else {
682
+ resolve(bestResult.text);
683
+ }
684
  } else {
685
+ resolve('No text could be extracted from the image.');
686
  }
687
+
688
+ } catch (error) {
689
+ console.error('Enhanced image OCR error:', error);
690
+ reject(error);
691
+ }
692
  });
693
+
694
/**
 * Runs `Tesseract.recognize` on an image, passing canvases through
 * `preprocessImage` first. Any other kind of input is handed to Tesseract
 * untouched.
 *
 * @param {*} image - Canvas or any other source Tesseract accepts.
 * @param {string} languages - '+'-joined Tesseract language codes.
 * @param {Object} config - Options object forwarded to `Tesseract.recognize`.
 * @returns {Promise<Object>} The value `Tesseract.recognize` resolves to.
 */
async function performOCRWithPreprocessing(image, languages, config) {
  const ocrInput = image instanceof HTMLCanvasElement
    ? await preprocessImage(image)
    : image;

  return Tesseract.recognize(ocrInput, languages, config);
}
705
+
706
/**
 * Heuristic quality score for OCR output, always a finite number in [0, 1].
 *
 * Half of the score is the share of words that contain at least one
 * Turkish-specific letter; the other half is the average sentence length in
 * words, capped at 10 words.
 *
 * Fixes over the previous version: text with no real sentence (e.g. "...")
 * produced 0/0 = NaN, which silently lost every comparison; the Turkish share
 * counted runs of characters instead of words, so it could exceed 1; and
 * leading/trailing whitespace inflated the word count with empty tokens.
 *
 * @param {string} text - OCR output to score.
 * @returns {number} Score between 0 (empty/unusable) and 1.
 */
function calculateConfidence(text) {
  if (typeof text !== 'string') return 0;

  const words = text.split(/\s+/).filter(Boolean);
  if (words.length === 0) return 0;

  // Share of words carrying a Turkish-specific letter.
  const turkishWordCount = words.filter(word => /[ğüşıöçĞÜŞİÖÇ]/.test(word)).length;
  const turkishRatio = turkishWordCount / words.length;

  // Average sentence length; zero when punctuation is all there is.
  const sentences = text
    .split(/[.!?]+/)
    .map(sentence => sentence.trim())
    .filter(sentence => sentence.length > 0);
  const sentenceWordTotal = sentences.reduce(
    (sum, sentence) => sum + sentence.split(/\s+/).length,
    0
  );
  const avgSentenceLength = sentences.length === 0 ? 0 : sentenceWordTotal / sentences.length;

  return (turkishRatio * 0.5) + (Math.min(avgSentenceLength / 10, 1) * 0.5);
}
722
+
723
/**
 * Produces the "formatted" output variant: every sentence end (., ! or ?
 * followed by whitespace) starts a new paragraph, runs of three or more line
 * breaks are reduced to one blank line, and extra spaces/tabs after a
 * capitalized word are collapsed to a single space.
 *
 * The last rule used to match any whitespace, so a line break that followed a
 * capitalized word was replaced by a space and the line structure was lost.
 * It is now limited to spaces and tabs.
 *
 * @param {string} text - Cleaned OCR text.
 * @returns {string} The text with paragraph breaks inserted.
 */
function createFormattedText(text) {
  return text
    .replace(/([.!?])\s+/g, '$1\n\n') // one paragraph per sentence
    .replace(/\n{3,}/g, '\n\n') // at most one blank line in a row
    .replace(/([A-ZÇĞİÖŞÜ][a-zçğıöşü]+)[ \t]+/g, '$1 ') // tidy spacing, keep line breaks
    .trim();
}
731
+ function processFormattedOCR(hocr) {
732
  // Apply learned corrections
733
  if (window.ocrLearningDict) {
734
  for (const [word, data] of Object.entries(window.ocrLearningDict)) {