Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import pandas as pd | |
| import numpy as np | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| from wordcloud import WordCloud | |
| import string | |
| import io | |
| from contextlib import redirect_stdout | |
| import re | |
| # Define a simple list of common English stop words | |
| STOP_WORDS = { | |
| 'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from', 'has', 'he', | |
| 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the', 'to', 'was', 'were', | |
| 'will', 'with', 'the', 'this', 'but', 'they', 'have', 'had', 'what', 'when', | |
| 'where', 'who', 'which', 'why', 'how', 'all', 'any', 'both', 'each', 'few', | |
| 'more', 'most', 'other', 'some', 'such', 'than', 'too', 'very', 'can', 'will', | |
| 'just', 'should', 'now' | |
| } | |
def simple_tokenize(text):
    """Lowercase *text*, strip ASCII punctuation, and split on whitespace.

    Returns a list of tokens; an empty or all-punctuation string yields [].
    """
    # One pass: lowercase, then drop every char in string.punctuation.
    cleaned = text.lower().translate(str.maketrans('', '', string.punctuation))
    return cleaned.split()
def remove_stop_words(tokens):
    """Return *tokens* with stop words filtered out.

    Comparison is case-insensitive (each token is lowercased before the
    STOP_WORDS membership test); surviving tokens keep their original case.
    """
    kept = []
    for token in tokens:
        if token.lower() not in STOP_WORDS:
            kept.append(token)
    return kept
def show():
    """Render the Week 4 "Introduction to NLP" course page.

    Lays out, in order: a course overview, the learning path, three teaching
    modules with interactive Streamlit widgets (tokenization, stop-word and
    punctuation removal, word-cloud generation), two practice exercises in
    expanders, and a per-student weekly assignment selected by the login name
    stored in ``st.session_state["username"]``.
    """
    st.title("Week 4: Introduction to Natural Language Processing")

    # Introduction Section
    st.header("Course Overview")
    st.write("""
    In this course, you'll learn fundamental Natural Language Processing (NLP) concepts by exploring a fascinating real-world question:
    What is the effect of releasing a preprint of a paper before it is submitted for peer review?
    Using the ICLR (International Conference on Learning Representations) database - which contains submissions, reviews, and author profiles
    from 2017-2022 - you'll develop practical NLP skills while investigating potential biases and patterns in academic publishing.
    """)

    # Learning Path
    st.subheader("Learning Path")
    st.write("""
    1. Understanding Text as Data: How computers represent and work with text
    2. Text Processing Fundamentals: Basic cleaning and normalization
    3. Quantitative Text Analysis: Measuring and comparing text features
    4. Tokenization Approaches: Breaking text into meaningful units
    5. Text Visualization Techniques: Creating insightful visual representations
    6. From Analysis to Insights: Drawing evidence-based conclusions
    """)

    # Module 1: Text as Data
    st.header("Module 1: Text as Data")
    st.write("""
    When we look at text like customer reviews or academic papers, we naturally understand the meaning.
    But how can a computer understand this?
    Key Concept: Text can be treated as data that we can analyze quantitatively.
    Unlike numerical data (age, price, temperature) that has inherent mathematical properties,
    text data needs to be transformed before we can analyze it.
    """)

    # Interactive Example: tokenization demo backed by simple_tokenize()
    st.subheader("Interactive Example: Text Tokenization")
    st.write("Let's try tokenizing some text:")
    example_text = st.text_area(
        "Enter some text to tokenize:",
        "The quick brown fox jumps over the lazy dog."
    )
    if st.button("Tokenize Text"):
        tokens = simple_tokenize(example_text)
        st.write("Tokens:", tokens)
        st.write("Number of tokens:", len(tokens))

    # Module 2: Text Processing
    st.header("Module 2: Text Processing")
    st.write("""
    Before we can analyze text, we need to clean and normalize it. This includes:
    - Converting to lowercase
    - Removing punctuation
    - Removing stop words
    - Basic text normalization
    """)

    # Interactive Text Processing: two independent buttons side by side
    st.subheader("Try Text Processing")
    st.write("""
    Let's process some text using different techniques:
    """)
    process_text = st.text_area(
        "Enter text to process:",
        "The quick brown fox jumps over the lazy dog.",
        key="process_text"
    )
    col1, col2 = st.columns(2)
    with col1:
        if st.button("Remove Stop Words"):
            tokens = simple_tokenize(process_text)
            filtered_words = remove_stop_words(tokens)
            st.write("After removing stop words:", filtered_words)
    with col2:
        if st.button("Remove Punctuation"):
            no_punct = process_text.translate(str.maketrans('', '', string.punctuation))
            st.write("After removing punctuation:", no_punct)

    # Module 3: Text Visualization
    st.header("Module 3: Text Visualization")
    st.write("""
    Visual representations help us identify patterns across text data.
    Common visualization methods include:
    - Word clouds
    - Frequency distributions
    - Sentiment over time
    - Topic clusters
    """)

    # Interactive Word Cloud
    st.subheader("Create a Word Cloud")
    st.write("""
    Let's create a word cloud from some text:
    """)
    wordcloud_text = st.text_area(
        "Enter text for word cloud:",
        "The quick brown fox jumps over the lazy dog. The fox is quick and brown. The dog is lazy.",
        key="wordcloud_text"
    )
    if st.button("Generate Word Cloud"):
        # Create and generate a word cloud image
        wordcloud = WordCloud().generate(wordcloud_text)
        # Draw on an explicit Figure: passing the global pyplot module to
        # st.pyplot is deprecated (and removed in recent Streamlit releases).
        fig = plt.figure(figsize=(10, 6))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        st.pyplot(fig)

    # Practice Exercises (code samples are display-only strings, never executed)
    st.header("Practice Exercises")
    with st.expander("Exercise 1: Text Processing"):
        st.write("""
        1. Load a sample text
        2. Remove stop words and punctuation
        3. Create a word cloud
        4. Analyze word frequencies
        """)
        st.code("""
# Solution
from wordcloud import WordCloud
import string

# Sample text
text = "Your text here"

# Remove punctuation
text = text.translate(str.maketrans('', '', string.punctuation))

# Remove stop words
tokens = text.split()
filtered_words = [word for word in tokens if word.lower() not in STOP_WORDS]

# Create word cloud
wordcloud = WordCloud().generate(' '.join(filtered_words))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
""")
    with st.expander("Exercise 2: Text Analysis"):
        st.write("""
        1. Calculate basic text metrics (word count, unique words)
        2. Perform basic text normalization
        3. Compare the results
        4. Visualize the differences
        """)
        st.code("""
# Solution
def normalize_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text

# Sample text
text = "Running, runs, ran, better, good"

# Normalize text
normalized = normalize_text(text)
words = normalized.split()

# Compare results
print(f"Original: {text}")
print(f"Normalized: {normalized}")
print(f"Word count: {len(words)}")
print(f"Unique words: {len(set(words))}")
""")

    # Per-student weekly assignment, keyed on the login name placed in
    # session state by the app's auth flow ("Student" when not logged in).
    username = st.session_state.get("username", "Student")
    st.header(f"{username}'s Weekly Assignment")
    if username == "manxiii":
        st.markdown("""
        Hello **manxiii**, here is your Assignment 4: Python Basics.
        1. Finish looking for 3 more research papers and add them to your literature review
        2. Finish the literature review for the 2 papers you have already summarized
        3. Add the plots from the previous week to the dataset section and add a description
        4. Link to your paper here: https://www.overleaf.com/project/68228f4ccb9d18d92c26ba13
        **Due Date:** End of Week 4
        """)
    elif username == "zhu":
        st.markdown("""
        Hello **zhu**, here is your Assignment 4: NLP Basics.
        """)
    elif username == "WK":
        st.markdown("""
        Hello **WK**, here is your Assignment 4: NLP Basics.
        **Due Date:** End of Week 4
        """)
    else:
        # Fallback for students whose assignment has not been published yet.
        st.markdown(f"""
        Hello **{username}**, your Assignment 4 has not been released yet. Please message the instructor.
        """)