ASDAD34 committed on
Commit
d9e45b7
·
verified ·
1 Parent(s): 4b28095

İçerik diye bunu vermiş {

Browse files

"fileName": "Çalışma Bakanlığı Tespit ve Noksanlıklar.pdf",
"fileType": "application/pdf",
"fileSize": 15315231,
"content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n",
"extractedAt": "2025-11-29T22:46:17.322Z"
}

Files changed (2) hide show
  1. script.js +97 -38
  2. style.css +19 -8
script.js CHANGED
@@ -146,43 +146,41 @@ document.addEventListener('DOMContentLoaded', function() {
146
  } else {
147
  throw new Error('Unsupported file type: ' + file.type);
148
  }
149
-
150
  // Convert content to requested format
151
  let formattedContent;
 
 
 
 
152
  if (format === 'json') {
153
  formattedContent = {
154
  fileName: file.name,
155
  fileType: file.type,
156
  fileSize: file.size,
157
- content: content,
158
  extractedAt: new Date().toISOString()
159
  };
 
160
  formattedContent = JSON.stringify(formattedContent, null, 2);
161
  } else if (format === 'markdown') {
162
  formattedContent = `# ${file.name}\n\n`;
163
- if (typeof content === 'string') {
164
- formattedContent += content;
165
- } else {
166
- formattedContent += JSON.stringify(content, null, 2)
167
- .replace(/\n/g, '\n\n')
168
- .replace(/"([^"]+)":/g, '**$1**:');
169
- }
170
  } else {
171
- // Plain text
172
- if (typeof content === 'string') {
173
- formattedContent = content;
174
- } else {
175
- formattedContent = JSON.stringify(content, null, 2);
176
- }
177
  }
178
-
179
- return {
180
  fileName: file.name,
181
  content: formattedContent,
182
  format: format
183
  };
184
  }
185
-
186
  async function extractTextFromPDF(file) {
187
  return new Promise((resolve, reject) => {
188
  const reader = new FileReader();
@@ -190,28 +188,64 @@ document.addEventListener('DOMContentLoaded', function() {
190
  reader.onload = async function(event) {
191
  try {
192
  const typedArray = new Uint8Array(event.target.result);
193
- const pdf = await pdfjsLib.getDocument(typedArray).promise;
194
- let text = '';
 
 
 
 
 
 
 
195
 
196
  for (let i = 1; i <= pdf.numPages; i++) {
197
  const page = await pdf.getPage(i);
198
- const content = await page.getTextContent();
199
- const strings = content.items.map(item => item.str);
200
- text += strings.join(' ') + '\n\n';
 
 
 
 
 
 
 
 
 
201
  }
202
 
203
- resolve(text);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
204
  } catch (error) {
205
- reject(error);
 
206
  }
207
  };
208
 
209
- reader.onerror = reject;
210
  reader.readAsArrayBuffer(file);
211
  });
212
  }
213
-
214
- async function extractTextFromWord(file) {
215
  return new Promise((resolve, reject) => {
216
  const reader = new FileReader();
217
 
@@ -320,7 +354,6 @@ async function extractTextFromImage(file) {
320
  return formattedText;
321
  }
322
  }
323
-
324
  function displayResult(result) {
325
  const resultCard = document.createElement('div');
326
  resultCard.className = 'bg-gray-50 rounded-lg p-4 shadow-sm';
@@ -342,25 +375,52 @@ async function extractTextFromImage(file) {
342
 
343
  const content = document.createElement('div');
344
 
 
 
 
 
 
 
345
  if (result.format === 'json') {
346
- const pre = document.createElement('pre');
347
- pre.textContent = result.content;
348
- content.appendChild(pre);
 
 
 
349
  } else {
350
- const pre = document.createElement('pre');
351
  pre.textContent = result.content;
352
- content.appendChild(pre);
353
  }
354
 
 
 
355
  resultCard.appendChild(header);
356
  resultCard.appendChild(content);
357
 
358
  resultsContainer.appendChild(resultCard);
359
  feather.replace();
360
  }
361
-
362
  function downloadResult(result) {
363
- const blob = new Blob([result.content], { type: 'text/plain' });
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
364
  const url = URL.createObjectURL(blob);
365
  const a = document.createElement('a');
366
  a.href = url;
@@ -370,8 +430,7 @@ async function extractTextFromImage(file) {
370
  document.body.removeChild(a);
371
  URL.revokeObjectURL(url);
372
  }
373
-
374
- downloadAllBtn.addEventListener('click', () => {
375
  processedResults.forEach(result => {
376
  downloadResult(result);
377
  });
 
146
  } else {
147
  throw new Error('Unsupported file type: ' + file.type);
148
  }
 
149
  // Convert content to requested format
150
  let formattedContent;
151
+
152
+ // Ensure content is properly encoded for Turkish characters
153
+ const cleanContent = typeof content === 'string' ? content : JSON.stringify(content, null, 2);
154
+
155
  if (format === 'json') {
156
  formattedContent = {
157
  fileName: file.name,
158
  fileType: file.type,
159
  fileSize: file.size,
160
+ content: cleanContent,
161
  extractedAt: new Date().toISOString()
162
  };
163
+ // Use custom replacer to handle Turkish characters properly
164
  formattedContent = JSON.stringify(formattedContent, null, 2);
165
  } else if (format === 'markdown') {
166
  formattedContent = `# ${file.name}\n\n`;
167
+ formattedContent += cleanContent;
168
+ } else if (format === 'formatted') {
169
+ // Apply better formatting for Turkish text
170
+ formattedContent = cleanContent
171
+ .replace(/([.!?])\s*/g, '$1\n\n') // Better paragraph breaks
172
+ .replace(/\n{3,}/g, '\n\n') // Remove excessive line breaks
173
+ .replace(/([A-ZÇĞİÖŞÜ][a-zçğıöşü]+)\s+/g, '$1 '); // Preserve Turkish words
174
  } else {
175
+ // Plain text - ensure Turkish characters are preserved
176
+ formattedContent = cleanContent;
 
 
 
 
177
  }
178
+ return {
 
179
  fileName: file.name,
180
  content: formattedContent,
181
  format: format
182
  };
183
  }
 
184
  async function extractTextFromPDF(file) {
185
  return new Promise((resolve, reject) => {
186
  const reader = new FileReader();
 
188
  reader.onload = async function(event) {
189
  try {
190
  const typedArray = new Uint8Array(event.target.result);
191
+ const loadingTask = pdfjsLib.getDocument({
192
+ data: typedArray,
193
+ cMapUrl: 'https://cdn.jsdelivr.net/npm/[email protected]/cmaps/',
194
+ cMapPacked: true,
195
+ standardFontDataUrl: 'https://cdn.jsdelivr.net/npm/[email protected]/standard_fonts/'
196
+ });
197
+
198
+ const pdf = await loadingTask.promise;
199
+ let fullText = '';
200
 
201
  for (let i = 1; i <= pdf.numPages; i++) {
202
  const page = await pdf.getPage(i);
203
+ const textContent = await page.getTextContent({
204
+ normalizeWhitespace: false,
205
+ disableCombineTextItems: false
206
+ });
207
+
208
+ const pageText = textContent.items
209
+ .map(item => item.str)
210
+ .join(' ');
211
+
212
+ if (pageText.trim()) {
213
+ fullText += pageText + '\n\n';
214
+ }
215
  }
216
 
217
+ // Try to fix common Turkish character encoding issues
218
+ fullText = fullText
219
+ .replace(/\u00C3\u00A7/g, 'ç') // ç
220
+ .replace(/\u00C3\u0087/g, 'Ç') // Ç
221
+ .replace(/\u00C3\u011F/g, 'ğ') // ğ
222
+ .replace(/\u00C4\u0178/g, 'Ğ') // Ğ
223
+ .replace(/\u00C3\u00BC/g, 'ü') // ü
224
+ .replace(/\u00C3\u009C/g, 'Ü') // Ü
225
+ .replace(/\u00C3\u015F/g, 'ş') // ş
226
+ .replace(/\u00C5\u0178/g, 'Ş') // Ş
227
+ .replace(/\u00C3\u0131/g, 'ı') // ı
228
+ .replace(/\u00C4\u0131/g, 'İ') // İ
229
+ .replace(/\u00C3\u00B6/g, 'ö') // ö
230
+ .replace(/\u00C3\u0096/g, 'Ö'); // Ö
231
+
232
+ if (!fullText.trim()) {
233
+ console.warn('PDF text extraction returned empty content. The PDF might be scanned or contain only images.');
234
+ fullText = 'Warning: No extractable text found. This PDF might be scanned or contain only images. Try OCR processing if available.';
235
+ }
236
+
237
+ resolve(fullText);
238
  } catch (error) {
239
+ console.error('PDF extraction error:', error);
240
+ reject(new Error('Failed to extract text from PDF: ' + error.message));
241
  }
242
  };
243
 
244
+ reader.onerror = () => reject(new Error('Failed to read PDF file'));
245
  reader.readAsArrayBuffer(file);
246
  });
247
  }
248
+ async function extractTextFromWord(file) {
 
249
  return new Promise((resolve, reject) => {
250
  const reader = new FileReader();
251
 
 
354
  return formattedText;
355
  }
356
  }
 
357
  function displayResult(result) {
358
  const resultCard = document.createElement('div');
359
  resultCard.className = 'bg-gray-50 rounded-lg p-4 shadow-sm';
 
375
 
376
  const content = document.createElement('div');
377
 
378
+ // Create pre element with proper Turkish character support
379
+ const pre = document.createElement('pre');
380
+ pre.className = result.format === 'formatted' ? 'ocr-result turkish-text' : '';
381
+ pre.style.cssText = 'font-family: monospace; white-space: pre-wrap; word-wrap: break-word; line-height: 1.6;';
382
+
383
+ // Handle content display with proper encoding
384
  if (result.format === 'json') {
385
+ try {
386
+ const parsed = JSON.parse(result.content);
387
+ pre.textContent = JSON.stringify(parsed, null, 2);
388
+ } catch (e) {
389
+ pre.textContent = result.content;
390
+ }
391
  } else {
 
392
  pre.textContent = result.content;
 
393
  }
394
 
395
+ content.appendChild(pre);
396
+
397
  resultCard.appendChild(header);
398
  resultCard.appendChild(content);
399
 
400
  resultsContainer.appendChild(resultCard);
401
  feather.replace();
402
  }
 
403
  function downloadResult(result) {
404
+ // Set proper MIME type and encoding for Turkish characters
405
+ let mimeType = 'text/plain;charset=utf-8';
406
+ let content = result.content;
407
+
408
+ if (result.format === 'json') {
409
+ mimeType = 'application/json;charset=utf-8';
410
+ } else if (result.format === 'markdown') {
411
+ mimeType = 'text/markdown;charset=utf-8';
412
+ }
413
+
414
+ // Add UTF-8 BOM for better Turkish character support in some applications
415
+ const bom = new Uint8Array([0xEF, 0xBB, 0xBF]);
416
+ const encoder = new TextEncoder();
417
+ const contentBytes = encoder.encode(content);
418
+
419
+ const combinedBytes = new Uint8Array(bom.length + contentBytes.length);
420
+ combinedBytes.set(bom);
421
+ combinedBytes.set(contentBytes, bom.length);
422
+
423
+ const blob = new Blob([combinedBytes], { type: mimeType });
424
  const url = URL.createObjectURL(blob);
425
  const a = document.createElement('a');
426
  a.href = url;
 
430
  document.body.removeChild(a);
431
  URL.revokeObjectURL(url);
432
  }
433
+ downloadAllBtn.addEventListener('click', () => {
 
434
  processedResults.forEach(result => {
435
  downloadResult(result);
436
  });
style.css CHANGED
@@ -29,10 +29,11 @@
29
  word-wrap: break-word;
30
  max-height: 500px;
31
  overflow-y: auto;
32
- font-family: monospace;
33
- line-height: 1.5;
 
 
34
  }
35
-
36
  /* OCR confidence styling */
37
  pre [confidence-low] {
38
  background-color: #ffebee;
@@ -44,16 +45,26 @@ pre [confidence-medium] {
44
  color: #f57f17;
45
  padding: 0 2px;
46
  }
47
-
48
  /* Turkish specific OCR styling */
49
  .turkish-text {
50
- font-family: 'Noto Sans', sans-serif;
51
  line-height: 1.8;
52
- letter-spacing: 0.5px;
 
53
  }
54
 
55
  .ocr-result {
56
  white-space: pre-wrap;
57
- font-size: 1.1rem;
58
- font-family: 'Segoe UI', Tahoma, sans-serif;
 
 
 
 
 
 
 
 
 
 
59
  }
 
29
  word-wrap: break-word;
30
  max-height: 500px;
31
  overflow-y: auto;
32
+ font-family: 'Courier New', monospace;
33
+ line-height: 1.6;
34
+ unicode-bidi: embed;
35
+ direction: ltr;
36
  }
 
37
  /* OCR confidence styling */
38
  pre [confidence-low] {
39
  background-color: #ffebee;
 
45
  color: #f57f17;
46
  padding: 0 2px;
47
  }
 
48
  /* Turkish specific OCR styling */
49
  .turkish-text {
50
+ font-family: 'Segoe UI', 'Tahoma', 'Arial Unicode MS', 'Noto Sans', sans-serif;
51
  line-height: 1.8;
52
+ letter-spacing: 0.3px;
53
+ unicode-bidi: embed;
54
  }
55
 
56
  .ocr-result {
57
  white-space: pre-wrap;
58
+ font-size: 1rem;
59
+ font-family: 'Segoe UI', 'Tahoma', 'Arial Unicode MS', sans-serif;
60
+ unicode-bidi: embed;
61
+ direction: ltr;
62
+ color: #2d3748;
63
+ }
64
+
65
+ /* Ensure proper Turkish character rendering */
66
+ @supports (font-variation-settings: normal) {
67
+ .turkish-text, .ocr-result {
68
+ font-family: 'Segoe UI Variable', 'Segoe UI', 'Tahoma', 'Arial Unicode MS', 'Noto Sans', sans-serif;
69
+ }
70
  }