# Hugging Face Spaces page header (status: Sleeping) - not part of the program.
import functools
import re

import gradio as gr
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from wordcloud import WordCloud
# Download the NLTK stopword corpus if it is not already present.
nltk.download('stopwords')

# Arabic stopwords are kept in a plain text file, one word per line
# (extend the file to add more words).
file_path = "list.txt"

# Decode explicitly as UTF-8: the file holds Arabic text, and the platform
# default encoding is not guaranteed to be UTF-8.
with open(file_path, 'r', encoding='utf-8') as file:
    lines_list = file.readlines()

# Strip surrounding whitespace/newlines and drop blank lines so that the
# empty string never ends up in the stopword list.
arabic_stopwords = [line.strip() for line in lines_list if line.strip()]
# Patterns used by clean_text, compiled once at import time.
# "RT" is only removed as a standalone token; without the word boundaries it
# would also be cut out of the middle of ordinary words such as "SMART".
_MENTION_OR_RT_RE = re.compile(r'@\w+|\bRT\b')
_URL_RE = re.compile(r'http\S+|www\S+')
_WHITESPACE_RE = re.compile(r'\s+')


def clean_text(text: str) -> str:
    """Remove usernames, retweet markers, URLs and extra whitespace.

    Args:
        text: Raw input text (for example, one or more tweets).

    Returns:
        The text with ``@username`` mentions, standalone ``RT`` tokens and
        ``http...``/``www...`` URLs removed, runs of whitespace collapsed to
        a single space, and leading/trailing whitespace stripped.
    """
    text = _MENTION_OR_RT_RE.sub('', text)  # Remove usernames and "RT"
    text = _URL_RE.sub('', text)  # Remove URLs
    # Collapse the gaps left behind by the removals above.
    return _WHITESPACE_RE.sub(' ', text).strip()
@functools.lru_cache(maxsize=None)
def _combined_stopwords():
    """Return the union of the Arabic and English stopwords, built once.

    The result is cached, so changes made to ``arabic_stopwords`` after the
    first call are not picked up.
    """
    return frozenset(arabic_stopwords).union(stopwords.words('english'))


# Function to remove Arabic and English stopwords
def remove_stopwords(text):
    """Drop Arabic and English stopwords from whitespace-separated text.

    Args:
        text: Input text; words are taken to be whitespace-separated tokens.

    Returns:
        The remaining words joined by single spaces, in their original order
        and original casing.
    """
    combined_stopwords = _combined_stopwords()
    # A word is dropped when it matches a stopword either exactly or after
    # lowercasing, so capitalised English stopwords ("The", "AND") are
    # removed too.
    return ' '.join(
        word for word in text.split()
        if word not in combined_stopwords
        and word.lower() not in combined_stopwords
    )
# Function to generate word cloud
def generate_wordcloud(text_array):
    """Render a word cloud image from a collection of text lines.

    The lines are joined, cleaned with ``clean_text``, filtered with
    ``remove_stopwords`` and rendered with the Amiri font.

    Args:
        text_array: Iterable of strings (for example, one sentence each).

    Returns:
        The path of the saved image, ``'wordcloud.png'``.

    Raises:
        ValueError: If no words remain after cleaning and stopword removal.
    """
    text_data = ' '.join(text_array)
    clean_text_data = remove_stopwords(clean_text(text_data))

    # Fail early with a clear message instead of handing an empty string to
    # the renderer.
    # NOTE(review): WordCloud.generate is expected to reject empty input with
    # a ValueError as well - confirm against the wordcloud documentation.
    if not clean_text_data:
        raise ValueError(
            "No words left to plot after cleaning and stopword removal."
        )

    # Generate the word cloud
    wordcloud = WordCloud(font_path='Amiri-Regular.ttf',
                          background_color='white',
                          width=800,
                          height=600,
                          colormap='tab20c').generate(clean_text_data)

    fig = plt.figure(figsize=(10, 8))
    try:
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        fig.savefig('wordcloud.png')
    finally:
        # pyplot keeps every figure alive until it is closed explicitly;
        # without this, each call would leave one more figure in memory.
        plt.close(fig)
    return 'wordcloud.png'
# Gradio callback
def gradio_interface(text_input):
    """Build a word cloud from the raw textbox contents.

    Args:
        text_input: The text entered by the user; it is split on newline
            characters before being passed on.

    Returns:
        Whatever ``generate_wordcloud`` returns for the resulting lines.
    """
    sentences = text_input.split('\n')
    image_path = generate_wordcloud(sentences)
    return image_path
# Multi-line textbox in which the user types or pastes the text.
_text_input_box = gr.Textbox(
    lines=10,
    placeholder="Enter text data (one sentence per line)",
)

# Wire the textbox to the word-cloud callback; the callback's return value
# is shown through an "image" output component.
interface = gr.Interface(
    fn=gradio_interface,
    inputs=_text_input_box,
    outputs="image",
    title="Arabic Word Cloud Generator",
    description="Generate a word cloud from Arabic text after cleaning and stopword removal.",
)

# Start the Gradio app.
interface.launch()