ASDAD34 committed on
Commit
bfa635c
·
verified ·
1 Parent(s): d9e45b7

Warning: No extractable text found. This PDF might be scanned or contain only images. Try OCR processing if available.

Browse files
Files changed (2) hide show
  1. script.js +77 -11
  2. style.css +9 -1
script.js CHANGED
@@ -228,14 +228,27 @@ return {
228
  .replace(/\u00C4\u0131/g, 'İ') // İ
229
  .replace(/\u00C3\u00B6/g, 'ö') // ö
230
  .replace(/\u00C3\u0096/g, 'Ö'); // Ö
231
-
232
  if (!fullText.trim()) {
233
- console.warn('PDF text extraction returned empty content. The PDF might be scanned or contain only images.');
234
- fullText = 'Warning: No extractable text found. This PDF might be scanned or contain only images. Try OCR processing if available.';
 
 
 
 
 
 
 
 
 
 
 
 
 
 
235
  }
236
 
237
  resolve(fullText);
238
- } catch (error) {
239
  console.error('PDF extraction error:', error);
240
  reject(new Error('Failed to extract text from PDF: ' + error.message));
241
  }
@@ -287,11 +300,48 @@ async function extractTextFromWord(file) {
287
  reader.readAsArrayBuffer(file);
288
  });
289
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
290
  async function extractTextFromImage(file) {
291
- return new Promise((resolve, reject) => {
292
- Tesseract.recognize(
293
- file,
294
- 'tur+eng', // Turkish + English languages
 
 
 
 
295
  {
296
  logger: m => console.log(m),
297
  preserve_interword_spaces: true,
@@ -355,7 +405,10 @@ async function extractTextFromImage(file) {
355
  }
356
  }
357
  function displayResult(result) {
358
- const resultCard = document.createElement('div');
 
 
 
359
  resultCard.className = 'bg-gray-50 rounded-lg p-4 shadow-sm';
360
 
361
  const header = document.createElement('div');
@@ -372,10 +425,23 @@ async function extractTextFromImage(file) {
372
 
373
  header.appendChild(title);
374
  header.appendChild(downloadBtn);
375
-
376
  const content = document.createElement('div');
377
 
378
- // Create pre element with proper Turkish character support
 
 
 
 
 
 
 
 
 
 
 
 
 
 
379
  const pre = document.createElement('pre');
380
  pre.className = result.format === 'formatted' ? 'ocr-result turkish-text' : '';
381
  pre.style.cssText = 'font-family: monospace; white-space: pre-wrap; word-wrap: break-word; line-height: 1.6;';
 
228
  .replace(/\u00C4\u0131/g, 'İ') // İ
229
  .replace(/\u00C3\u00B6/g, 'ö') // ö
230
  .replace(/\u00C3\u0096/g, 'Ö'); // Ö
 
231
  if (!fullText.trim()) {
232
+ console.warn('PDF text extraction returned empty content. Attempting OCR processing...');
233
+ try {
234
+ // Convert PDF to image for OCR processing
235
+ const images = await convertPDFToImages(typedArray);
236
+ let ocrText = '';
237
+
238
+ for (const image of images) {
239
+ const text = await extractTextFromImage(image);
240
+ ocrText += text + '\n\n';
241
+ }
242
+
243
+ fullText = ocrText || 'OCR processing attempted but no text was found.';
244
+ } catch (ocrError) {
245
+ console.error('OCR processing failed:', ocrError);
246
+ fullText = 'Warning: No extractable text found. OCR processing also failed: ' + ocrError.message;
247
+ }
248
  }
249
 
250
  resolve(fullText);
251
+ } catch (error) {
252
  console.error('PDF extraction error:', error);
253
  reject(new Error('Failed to extract text from PDF: ' + error.message));
254
  }
 
300
  reader.readAsArrayBuffer(file);
301
  });
302
  }
303
/**
 * Renders the leading pages of a PDF onto canvases so they can be handed to
 * the OCR step.
 *
 * Written as a plain async function (no `new Promise(async ...)` wrapper) so
 * that any failure while loading the document, fetching a page, or rendering
 * rejects the returned promise instead of leaving it pending forever.
 *
 * @param {*} pdfData - Passed through as the `data` option of
 *   `pdfjsLib.getDocument` (the caller passes a typed array).
 * @param {Object} [options]
 * @param {number} [options.maxPages=5] - Upper bound on how many pages are rendered.
 * @param {number} [options.scale=1.5] - Viewport scale used for rendering.
 * @returns {Promise<HTMLCanvasElement[]>} One rendered canvas per processed
 *   page, in page order.
 * @throws Propagates any error raised by pdf.js while loading or rendering.
 */
async function convertPDFToImages(pdfData, { maxPages = 5, scale = 1.5 } = {}) {
  // NOTE(review): the "[email protected]" segment in the two URLs below looks like an
  // e-mail-obfuscation artifact of a `package@version` specifier — confirm
  // against the upstream file and restore the real CDN path.
  const loadingTask = pdfjsLib.getDocument({
    data: pdfData,
    cMapUrl: 'https://cdn.jsdelivr.net/npm/[email protected]/cmaps/',
    cMapPacked: true,
    standardFontDataUrl: 'https://cdn.jsdelivr.net/npm/[email protected]/standard_fonts/',
  });

  const pdf = await loadingTask.promise;
  const pageCount = Math.min(pdf.numPages, maxPages);
  const images = [];

  // Pages are 1-indexed in pdf.js; render them one after another.
  for (let pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
    const page = await pdf.getPage(pageNumber);
    const viewport = page.getViewport({ scale });
    const canvas = document.createElement('canvas');
    const context = canvas.getContext('2d');

    // Size the canvas to the scaled page before rendering into it.
    canvas.height = viewport.height;
    canvas.width = viewport.width;

    await page.render({
      canvasContext: context,
      viewport,
    }).promise;

    images.push(canvas);
  }

  return images;
}
335
+
336
  async function extractTextFromImage(file) {
337
+ return new Promise((resolve, reject) => {
338
+ const imageElement = file instanceof HTMLCanvasElement ?
339
+ file :
340
+ file;
341
+
342
+ Tesseract.recognize(
343
+ imageElement,
344
+ 'tur+eng', // Turkish + English languages
345
  {
346
  logger: m => console.log(m),
347
  preserve_interword_spaces: true,
 
405
  }
406
  }
407
  function displayResult(result) {
408
+ // Check if this was an OCR fallback result
409
+ const isOCRResult = result.content.includes('OCR processing attempted') ||
410
+ result.content.includes('Warning: No extractable text found');
411
+ const resultCard = document.createElement('div');
412
  resultCard.className = 'bg-gray-50 rounded-lg p-4 shadow-sm';
413
 
414
  const header = document.createElement('div');
 
425
 
426
  header.appendChild(title);
427
  header.appendChild(downloadBtn);
 
428
  const content = document.createElement('div');
429
 
430
+ if (isOCRResult) {
431
+ const warning = document.createElement('div');
432
+ warning.className = 'pdf-ocr-warning';
433
+ warning.innerHTML = `
434
+ <div class="flex items-start">
435
+ <i data-feather="alert-triangle" class="mr-2"></i>
436
+ <div>
437
+ <strong>Note:</strong> This PDF was processed using OCR as no selectable text was found.
438
+ Results may contain errors or inaccuracies.
439
+ </div>
440
+ </div>
441
+ `;
442
+ content.appendChild(warning);
443
+ }
444
+ // Create pre element with proper Turkish character support
445
  const pre = document.createElement('pre');
446
  pre.className = result.format === 'formatted' ? 'ocr-result turkish-text' : '';
447
  pre.style.cssText = 'font-family: monospace; white-space: pre-wrap; word-wrap: break-word; line-height: 1.6;';
style.css CHANGED
@@ -52,9 +52,17 @@ pre [confidence-medium] {
52
  letter-spacing: 0.3px;
53
  unicode-bidi: embed;
54
  }
 
 
 
 
 
 
 
 
55
 
56
  .ocr-result {
57
- white-space: pre-wrap;
58
  font-size: 1rem;
59
  font-family: 'Segoe UI', 'Tahoma', 'Arial Unicode MS', sans-serif;
60
  unicode-bidi: embed;
 
52
  letter-spacing: 0.3px;
53
  unicode-bidi: embed;
54
  }
55
+ .pdf-ocr-warning {
56
+ background-color: #fff3cd;
57
+ color: #856404;
58
+ padding: 12px;
59
+ border-radius: 4px;
60
+ margin-bottom: 16px;
61
+ border-left: 4px solid #ffeeba;
62
+ }
63
 
64
  .ocr-result {
65
+ white-space: pre-wrap;
66
  font-size: 1rem;
67
  font-family: 'Segoe UI', 'Tahoma', 'Arial Unicode MS', sans-serif;
68
  unicode-bidi: embed;