-
Notifications
You must be signed in to change notification settings - Fork 1
/
main.py
167 lines (134 loc) · 5.89 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
import streamlit as st
import nltk
import requests
import pandas as pd
from bs4 import BeautifulSoup
import time
import readtime
import textstat
from io import StringIO
import heapq
from nltk import sent_tokenize, word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
import plotly.express as px
from streamlit_option_menu import option_menu
# Navbar: one horizontal tab per analysis page, each paired with its icon.
# `selected` is compared against these labels further down to decide which
# page to render.
_nav_pages = {
    'URL Analysis': 'link-45deg',
    'Text Complexity Analysis': 'file-text',
    'Dataset Analysis': 'bar-chart-line',
}
selected = option_menu(
    menu_title=None,
    options=list(_nav_pages),
    icons=list(_nav_pages.values()),
    menu_icon='cast',
    default_index=0,
    orientation='horizontal',
)

# Page heading, then a hint that points the user at the sidebar.
st.title("Welcome to EcoRead")
st.markdown('___')
st.write(':point_left: Use the sidebar on the left to learn about EcoRead (click on > if closed).')
st.divider()
# SIDEBAR
def _sidebar_heading(title):
    """Render *title* in the sidebar as a styled, bold ``<h3>`` heading.

    Only constant titles defined in this file are passed in, so the raw HTML
    is safe to render with ``unsafe_allow_html=True``.
    """
    st.sidebar.markdown(
        "<h3 style='text-align: left; color:#F63366; font-size:18px;'>"
        f"<b>{title}</b></h3>",  # the original closed <b> with a second <b>
        unsafe_allow_html=True,
    )


_sidebar_heading("What is this App about?")
st.sidebar.info(
    "\nLearning happens best when content is personalised to meet our needs and strengths.\n"
)
st.sidebar.write("---")
st.sidebar.markdown(
    "\nEcoRead allows you to analyse the sentiment of a pre-existing dataset, "
    "summarise the contents of an article for quicker information digestion "
    "as well as analyse the complexity of a text.\n"
)
st.sidebar.divider()
st.sidebar.write("Are you into NLP? Our code is 100% open source and written for easy understanding. Fork it from [GitHub](https://github.com/akebu6/EcoRead), and pull any suggestions you may have. Become part of the community! Help yourself and help others :smiley:")
st.sidebar.divider()
_sidebar_heading("Who is this App for?")
st.sidebar.write("Anyone can use this App completely for free! If you like it :heart:, show your support by sharing :+1: ")
st.sidebar.divider()

# Contact details, collapsed by default inside an expander.
expander = st.sidebar.expander('Contact')
expander.write(
    "I'd love your feedback :smiley: Want to collaborate? Develop a project? Find me on [LinkedIn](https://www.linkedin.com/in/akebu-simasiku-24186720a/) and [Twitter](https://twitter.com/akebu)"
)
# Function to extract text content from a URL
def extract_text_from_url(url, timeout=10):
    """Download *url* and return the text of its ``<p>`` elements.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on an
            unresponsive host.

    Returns:
        The text of every ``<p>`` tag in the page joined with single spaces
        (an empty string when the page contains no paragraphs).

    Raises:
        requests.RequestException: If the request cannot be made or times out.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')
    # Only <p> tags are read; adjust this based on the HTML structure of the
    # pages being analysed.
    paragraphs = soup.find_all('p')
    return ' '.join(para.get_text() for para in paragraphs)
def _content_words(fragment, stop_words):
    """Return the lower-cased tokens of *fragment* that carry content.

    Tokens found in *stop_words* and tokens without any letter or digit
    (pure punctuation) are dropped.
    """
    lowered = (token.lower() for token in word_tokenize(fragment))
    return [
        token
        for token in lowered
        if token not in stop_words and any(ch.isalnum() for ch in token)
    ]


# Function to generate a summarized version of the text
def generate_summary(text, num_sentences=3):
    """Build an extractive summary made of the highest-scoring sentences.

    Each sentence is scored by summing the document-wide frequency of every
    distinct content word it contains. The top ``num_sentences`` sentences
    are returned in their original order.

    Args:
        text: The document to summarise.
        num_sentences: Maximum number of sentences to keep.

    Returns:
        The selected sentences joined with single spaces, or an empty string
        when no sentence contains a content word.
    """
    sentences = sent_tokenize(text)
    # Build the stop-word set once instead of once per word, and use a set so
    # each membership test is O(1).
    stop_words = set(stopwords.words('english'))

    # Words are normalised to lower case both when counting and when scoring,
    # so capitalised words are no longer ignored, and whole tokens are matched
    # so "art" does not count inside "start".
    words_per_sentence = [_content_words(sentence, stop_words) for sentence in sentences]
    word_freq = FreqDist(word for words in words_per_sentence for word in words)

    # Sentences without any content word get no entry and can never be picked.
    ranking = {
        index: sum(word_freq[word] for word in set(words))
        for index, words in enumerate(words_per_sentence)
        if words
    }
    selected_sentences = heapq.nlargest(num_sentences, ranking, key=ranking.get)
    # Re-sort by position so the summary reads in document order.
    return ' '.join(sentences[i] for i in sorted(selected_sentences))
# User Interaction: fetch an article by URL and show its summary.
if selected == "URL Analysis":
    url = st.text_input("Enter the URL of the climate change article and press Enter or Return: 👇 ", placeholder="Enter a valid URL")
    if st.button('Search'):
        url = url.strip()
        if not url:
            # An empty URL would otherwise surface as a raw requests traceback.
            st.error('Please enter a URL first.')
        else:
            try:
                with st.spinner('Processing...'):
                    # Pause kept from the original flow (presumably so the
                    # spinner stays visible for fast responses).
                    time.sleep(2)
                    text = extract_text_from_url(url)
                    summary = generate_summary(text)
            except requests.RequestException as err:
                # Covers malformed URLs, connection failures and timeouts.
                st.error(f'Could not retrieve the article: {err}')
            else:
                st.write("\nSummary:")
                st.write(summary)
def _render_text_complexity(string_data):
    """Compute and display the complexity metrics for *string_data*.

    Shows reading time, Flesch reading ease, lexical richness and sentence
    count, each under its own caption. Texts longer than 10,000 characters
    and texts without any token are rejected with an error message instead.
    """
    if len(string_data) > 10000:
        st.error('Please upload a file of maximum 10,000 characters')
        return

    tokenized_words = word_tokenize(string_data)
    if not tokenized_words:
        # Guards the lexical-richness division below against an empty upload.
        st.error('The uploaded file contains no text to analyse')
        return

    rt = readtime.of_text(string_data)
    tc = textstat.flesch_reading_ease(string_data)
    lr = round(len(set(tokenized_words)) / len(tokenized_words), 2)
    n_s = textstat.sentence_count(string_data)

    metrics = [
        ('Reading Time', rt),
        ('Text Complexity: from 0 or negative (hard to read), to 100 or more (easy to read)', tc),
        ('Lexical Richness (distinct words over total number of words)', lr),
        ('Number of sentences', n_s),
    ]
    for caption, value in metrics:
        st.markdown('___')
        st.text(caption)
        st.write(value)


# Uploading a text file
if selected == 'Text Complexity Analysis':
    file = st.file_uploader('Upload your file here', type=['txt'])
    if file is not None:
        time.sleep(2)
        try:
            string_data = file.getvalue().decode("utf-8")
        except UnicodeDecodeError:
            # A .txt extension does not guarantee UTF-8 content.
            st.error('Could not read the file: it is not valid UTF-8 text')
        else:
            _render_text_complexity(string_data)
# Dataset Analysis: plot the sentiment distribution of the tweet dataset
if selected == "Dataset Analysis":
    PATH = "./csv_files/"
    DATA = "twitter_sentiment_data.csv"
    # The path is relative, so it is resolved against the working directory.
    data = pd.read_csv(PATH + DATA)

    # set up label dataframe for future references (not used by the chart below)
    label = [-1, 0, 1, 2]
    labelN = ["Anti", "Neutral", "Pro", "News"]
    labelDesc = [
        "the tweet does not believe in man-made climate change",
        "the tweet neither supports nor refutes the belief of man-made climate change",
        "the tweet supports the belief of man-made climate change",
        "the tweet links to factual news about climate change",
    ]
    labelDf = pd.DataFrame({"label": label, "name": labelN, "description": labelDesc})

    # Count the tweets per sentiment once and pass the result by keyword:
    # positionally, px.pie takes (data_frame, names, values), so the original
    # call supplied no values and the slice sizes ignored the counts.
    sentiment_counts = data.sentiment.value_counts()
    fig = px.pie(
        values=sentiment_counts.values,
        names=sentiment_counts.index,
        title='Sentiment Distribution of the Tweet Dataset',
    )
    st.plotly_chart(fig)