Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import pandas as pd | |
| import numpy as np | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| from wordcloud import WordCloud | |
| import string | |
| import io | |
| from contextlib import redirect_stdout | |
| import re | |
| # Define a simple list of common English stop words | |
| STOP_WORDS = { | |
| 'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from', 'has', 'he', | |
| 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the', 'to', 'was', 'were', | |
| 'will', 'with', 'the', 'this', 'but', 'they', 'have', 'had', 'what', 'when', | |
| 'where', 'who', 'which', 'why', 'how', 'all', 'any', 'both', 'each', 'few', | |
| 'more', 'most', 'other', 'some', 'such', 'than', 'too', 'very', 'can', 'will', | |
| 'just', 'should', 'now' | |
| } | |
def simple_tokenize(text):
    """Lowercase *text*, strip ASCII punctuation, and split on whitespace.

    Returns a list of tokens; an empty or all-punctuation string yields [].
    """
    # One pass: lowercase, then drop every char in string.punctuation.
    cleaned = text.lower().translate(str.maketrans('', '', string.punctuation))
    return cleaned.split()
def remove_stop_words(tokens):
    """Return *tokens* with stop words filtered out.

    Comparison is case-insensitive (each token is lowercased before the
    STOP_WORDS membership test); surviving tokens keep their original case.
    """
    kept = []
    for token in tokens:
        if token.lower() not in STOP_WORDS:
            kept.append(token)
    return kept
def show():
    """Render the Week 4 "Introduction to NLP" course page.

    Lays out, in order: a course overview, the learning path, three teaching
    modules with interactive Streamlit widgets (tokenization, stop-word and
    punctuation removal, word-cloud generation), two practice exercises in
    expanders, and a per-student weekly assignment selected by the login name
    stored in ``st.session_state["username"]``.
    """
    st.title("Week 4: Introduction to Natural Language Processing")

    # Introduction Section
    st.header("Course Overview")
    st.write("""
    In this course, you'll learn fundamental Natural Language Processing (NLP) concepts by exploring a fascinating real-world question:
    What is the effect of releasing a preprint of a paper before it is submitted for peer review?
    Using the ICLR (International Conference on Learning Representations) database - which contains submissions, reviews, and author profiles
    from 2017-2022 - you'll develop practical NLP skills while investigating potential biases and patterns in academic publishing.
    """)

    # Learning Path
    st.subheader("Learning Path")
    st.write("""
    1. Understanding Text as Data: How computers represent and work with text
    2. Text Processing Fundamentals: Basic cleaning and normalization
    3. Quantitative Text Analysis: Measuring and comparing text features
    4. Tokenization Approaches: Breaking text into meaningful units
    5. Text Visualization Techniques: Creating insightful visual representations
    6. From Analysis to Insights: Drawing evidence-based conclusions
    """)

    # Module 1: Text as Data
    st.header("Module 1: Text as Data")
    st.write("""
    When we look at text like customer reviews or academic papers, we naturally understand the meaning.
    But how can a computer understand this?
    Key Concept: Text can be treated as data that we can analyze quantitatively.
    Unlike numerical data (age, price, temperature) that has inherent mathematical properties,
    text data needs to be transformed before we can analyze it.
    """)

    # Interactive Example: tokenization demo backed by simple_tokenize()
    st.subheader("Interactive Example: Text Tokenization")
    st.write("Let's try tokenizing some text:")
    example_text = st.text_area(
        "Enter some text to tokenize:",
        "The quick brown fox jumps over the lazy dog."
    )
    if st.button("Tokenize Text"):
        tokens = simple_tokenize(example_text)
        st.write("Tokens:", tokens)
        st.write("Number of tokens:", len(tokens))

    # Module 2: Text Processing
    st.header("Module 2: Text Processing")
    st.write("""
    Before we can analyze text, we need to clean and normalize it. This includes:
    - Converting to lowercase
    - Removing punctuation
    - Removing stop words
    - Basic text normalization
    """)

    # Interactive Text Processing: two independent buttons side by side
    st.subheader("Try Text Processing")
    st.write("""
    Let's process some text using different techniques:
    """)
    process_text = st.text_area(
        "Enter text to process:",
        "The quick brown fox jumps over the lazy dog.",
        key="process_text"
    )
    col1, col2 = st.columns(2)
    with col1:
        if st.button("Remove Stop Words"):
            tokens = simple_tokenize(process_text)
            filtered_words = remove_stop_words(tokens)
            st.write("After removing stop words:", filtered_words)
    with col2:
        if st.button("Remove Punctuation"):
            no_punct = process_text.translate(str.maketrans('', '', string.punctuation))
            st.write("After removing punctuation:", no_punct)

    # Module 3: Text Visualization
    st.header("Module 3: Text Visualization")
    st.write("""
    Visual representations help us identify patterns across text data.
    Common visualization methods include:
    - Word clouds
    - Frequency distributions
    - Sentiment over time
    - Topic clusters
    """)

    # Interactive Word Cloud
    st.subheader("Create a Word Cloud")
    st.write("""
    Let's create a word cloud from some text:
    """)
    wordcloud_text = st.text_area(
        "Enter text for word cloud:",
        "The quick brown fox jumps over the lazy dog. The fox is quick and brown. The dog is lazy.",
        key="wordcloud_text"
    )
    if st.button("Generate Word Cloud"):
        # Create and generate a word cloud image
        wordcloud = WordCloud().generate(wordcloud_text)
        # Draw on an explicit Figure: passing the global pyplot module to
        # st.pyplot is deprecated (and removed in recent Streamlit releases).
        fig = plt.figure(figsize=(10, 6))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        st.pyplot(fig)

    # Practice Exercises (code samples are display-only strings, never executed)
    st.header("Practice Exercises")
    with st.expander("Exercise 1: Text Processing"):
        st.write("""
        1. Load a sample text
        2. Remove stop words and punctuation
        3. Create a word cloud
        4. Analyze word frequencies
        """)
        st.code("""
# Solution
from wordcloud import WordCloud
import string

# Sample text
text = "Your text here"

# Remove punctuation
text = text.translate(str.maketrans('', '', string.punctuation))

# Remove stop words
tokens = text.split()
filtered_words = [word for word in tokens if word.lower() not in STOP_WORDS]

# Create word cloud
wordcloud = WordCloud().generate(' '.join(filtered_words))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
""")
    with st.expander("Exercise 2: Text Analysis"):
        st.write("""
        1. Calculate basic text metrics (word count, unique words)
        2. Perform basic text normalization
        3. Compare the results
        4. Visualize the differences
        """)
        st.code("""
# Solution
def normalize_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text

# Sample text
text = "Running, runs, ran, better, good"

# Normalize text
normalized = normalize_text(text)
words = normalized.split()

# Compare results
print(f"Original: {text}")
print(f"Normalized: {normalized}")
print(f"Word count: {len(words)}")
print(f"Unique words: {len(set(words))}")
""")

    # Per-student weekly assignment, keyed on the login name placed in
    # session state by the app's auth flow ("Student" when not logged in).
    username = st.session_state.get("username", "Student")
    st.header(f"{username}'s Weekly Assignment")
    if username == "manxiii":
        st.markdown("""
        Hello **manxiii**, here is your Assignment 4: Python Basics.
        1. Finish looking for 3 more research papers and add them to your literature review
        2. Finish the literature review for the 2 papers you have already summarized
        3. Add the plots from the previous week to the dataset section and add a description
        4. Link to your paper here: https://www.overleaf.com/project/68228f4ccb9d18d92c26ba13
        **Due Date:** End of Week 4
        """)
    elif username == "zhu":
        st.markdown("""
        Hello **zhu**, here is your Assignment 4: NLP Basics.
        """)
    elif username == "WK":
        st.markdown("""
        Hello **WK**, here is your Assignment 4: NLP Basics.
        **Due Date:** End of Week 4
        """)
    else:
        # Fallback for students whose assignment has not been published yet.
        st.markdown(f"""
        Hello **{username}**, your Assignment 4 has not been released yet. Please message the instructor.
        """)