import gradio as gr
import re
import nltk
from nltk.corpus import stopwords
from wordcloud import WordCloud
import matplotlib.pyplot as plt
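
# Note (assumption): this app expects two local files next to this script:
#   - list.txt          : Arabic stopwords, one per line (assumed UTF-8 encoded)
#   - Amiri-Regular.ttf : an Arabic-capable font passed to WordCloud via font_path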

# Download stopwords if not already present
nltk.download('stopwords')

# Load Arabic stopwords from a local file (one stopword per line)
file_path = "list.txt"

# Read the file with UTF-8 encoding so Arabic characters decode correctly,
# stripping newlines and skipping blank lines
with open(file_path, 'r', encoding='utf-8') as file:
    arabic_stopwords = [line.strip() for line in file if line.strip()]

# Function to clean text (removes usernames, retweet markers, URLs, and extra whitespace)
def clean_text(text):
    text = re.sub(r'@\w+|\bRT\b', '', text)      # Remove @usernames and standalone RT markers
    text = re.sub(r'http\S+|www\S+', '', text)   # Remove URLs
    text = re.sub(r'\s+', ' ', text).strip()     # Collapse extra whitespace
    return text

# Function to remove Arabic and English stopwords
def remove_stopwords(text):
    words = text.split()
    english_stopwords = set(stopwords.words('english'))  # English stopwords
    combined_stopwords = set(arabic_stopwords).union(english_stopwords)
    return ' '.join([word for word in words if word not in combined_stopwords])

# Function to generate word cloud
def generate_wordcloud(text_array):
    text_data = ' '.join(text_array)
    clean_text_data = clean_text(text_data)
    clean_text_data = remove_stopwords(clean_text_data)
    
    # Generate the word cloud
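    # Note (assumption): WordCloud/matplotlib draw Arabic glyphs unshaped and
    # left-to-right; for correctly connected, right-to-left Arabic output, the text
    # would typically be passed through arabic_reshaper.reshape() and
    # bidi.algorithm.get_display() before calling generate().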
    wordcloud = WordCloud(font_path='Amiri-Regular.ttf', 
                          background_color='white',
                          width=800,
                          height=600,
                          colormap='tab20c').generate(clean_text_data)

    plt.figure(figsize=(10, 8))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.savefig('wordcloud.png')
    plt.close()  # Close the figure so repeated requests don't accumulate open figures
    return 'wordcloud.png'

# Gradio Interface
def gradio_interface(text_input):
    text_array = text_input.split('\n')  # Split input by new lines
    return generate_wordcloud(text_array)

# Create Gradio Interface
interface = gr.Interface(fn=gradio_interface, 
                         inputs=gr.Textbox(lines=10, placeholder="Enter text data (one sentence per line)"), 
                         outputs="image",
                         title="Arabic Word Cloud Generator",
                         description="Generate a word cloud from Arabic text after cleaning and stopword removal.")

# Launch Gradio Interface
interface.launch()