Spaces:

thuonguyenvan
/

Plant-Recognition-with-Q-A-System-Backend

Sleeping

Plant-Recognition-with-Q-A-System-Backend / set_up_supabasedb.sql

Thuong Nguyen

Initial deployment: FastAPI backend with data and Dockerfile

f3cb94f about 1 month ago

8.88 kB

	-- ============================================================================
	-- COMPLETE SUPABASE DATABASE SETUP FOR PLANT RECOGNITION SYSTEM
	-- ============================================================================
	-- This file consolidates all database setup, functions, and maintenance scripts
	-- Run this in Supabase SQL Editor to set up the complete database
	--
	-- Sections:
	-- 1. Initial Setup (Extensions, Tables, Indexes)
	-- 2. Core Functions (Vector Search)
	-- 3. Advanced Functions (Plant Filters, Case-Insensitive Search)
	-- 4. Optimization (Index Tuning)
	-- 5. Maintenance Operations (Cleanup, Clear)
	-- ============================================================================

	-- ============================================================================
	-- SECTION 1: INITIAL DATABASE SETUP
	-- ============================================================================

	-- Enable pgvector extension
	CREATE EXTENSION IF NOT EXISTS vector;

	-- Drop existing objects if recreating (uncomment if needed for clean setup)
	-- DROP FUNCTION IF EXISTS match_hypernodes_combined(vector, float, int, float);
	-- DROP FUNCTION IF EXISTS match_hypernodes_by_value(vector, float, int, text);
	-- DROP FUNCTION IF EXISTS match_hypernodes_by_key(vector, float, int, text);
	-- DROP TABLE IF EXISTS hypernodes CASCADE;

	-- Create hypernodes table with proper schema
	CREATE TABLE IF NOT EXISTS hypernodes (
	id BIGSERIAL PRIMARY KEY,

	-- HyperNode data
	key TEXT NOT NULL,
	value TEXT NOT NULL,

	-- Embeddings (1024 dimensions for Vietnamese_Embedding)
	key_embedding vector(1024),
	value_embedding vector(1024),

	-- Metadata
	plant_name TEXT NOT NULL,
	section TEXT,
	chunk_id INTEGER DEFAULT 0,
	is_chunked BOOLEAN DEFAULT FALSE,

	-- Timestamps
	created_at TIMESTAMP DEFAULT NOW(),
	updated_at TIMESTAMP DEFAULT NOW()
	);


	-- Regular indexes for filtering
	CREATE INDEX IF NOT EXISTS hypernodes_plant_name_idx ON hypernodes(plant_name);
	CREATE INDEX IF NOT EXISTS hypernodes_section_idx ON hypernodes(section);

	-- Case-insensitive plant name index for filtering
	CREATE INDEX IF NOT EXISTS hypernodes_plant_name_lower_idx ON hypernodes(LOWER(plant_name));

	-- ============================================================================
	-- SECTION 3: CORE VECTOR SEARCH FUNCTIONS
	-- ============================================================================

	-- Function: Search by Key Embedding
	-- Supports optional case-insensitive plant name filtering
	CREATE OR REPLACE FUNCTION match_hypernodes_by_key(
	query_embedding vector(1024),
	match_threshold float DEFAULT 0.5,
	match_count int DEFAULT 20,
	filter_plant_name text DEFAULT NULL
	)
	RETURNS TABLE (
	id int,
	key text,
	value text,
	plant_name text,
	section text,
	key_embedding vector(1024),
	value_embedding vector(1024),
	similarity float
	)
	LANGUAGE SQL STABLE
	AS $$
	SELECT
	hypernodes.id,
	hypernodes.key,
	hypernodes.value,
	hypernodes.plant_name,
	hypernodes.section,
	hypernodes.key_embedding,
	hypernodes.value_embedding,
	1 - (hypernodes.key_embedding <=> query_embedding) as similarity
	FROM hypernodes
	WHERE 1 - (hypernodes.key_embedding <=> query_embedding) > match_threshold
	AND (filter_plant_name IS NULL OR LOWER(hypernodes.plant_name) = LOWER(filter_plant_name))
	ORDER BY hypernodes.key_embedding <=> query_embedding
	LIMIT match_count;
	$$;

	-- Function: Search by Value Embedding
	-- Supports optional case-insensitive plant name filtering
	CREATE OR REPLACE FUNCTION match_hypernodes_by_value(
	query_embedding vector(1024),
	match_threshold float DEFAULT 0.5,
	match_count int DEFAULT 20,
	filter_plant_name text DEFAULT NULL
	)
	RETURNS TABLE (
	id int,
	key text,
	value text,
	plant_name text,
	section text,
	key_embedding vector(1024),
	value_embedding vector(1024),
	similarity float
	)
	LANGUAGE SQL STABLE
	AS $$
	SELECT
	hypernodes.id,
	hypernodes.key,
	hypernodes.value,
	hypernodes.plant_name,
	hypernodes.section,
	hypernodes.key_embedding,
	hypernodes.value_embedding,
	1 - (hypernodes.value_embedding <=> query_embedding) as similarity
	FROM hypernodes
	WHERE 1 - (hypernodes.value_embedding <=> query_embedding) > match_threshold
	AND (filter_plant_name IS NULL OR LOWER(hypernodes.plant_name) = LOWER(filter_plant_name))
	ORDER BY hypernodes.value_embedding <=> query_embedding
	LIMIT match_count;
	$$;

	-- Function: Combined Key+Value Search
	-- Weighted combination of key and value similarity
	CREATE OR REPLACE FUNCTION match_hypernodes_combined(
	query_embedding vector(1024),
	match_threshold float DEFAULT 0.5,
	match_count int DEFAULT 20,
	key_weight float DEFAULT 0.5
	)
	RETURNS TABLE (
	id bigint,
	key text,
	value text,
	plant_name text,
	section text,
	similarity float
	)
	LANGUAGE SQL STABLE
	AS $$
	SELECT
	id,
	key,
	value,
	plant_name,
	section,
	(
	key_weight * (1 - (key_embedding <=> query_embedding)) +
	(1 - key_weight) * (1 - (value_embedding <=> query_embedding))
	) as similarity
	FROM hypernodes
	WHERE (
	key_weight * (1 - (key_embedding <=> query_embedding)) +
	(1 - key_weight) * (1 - (value_embedding <=> query_embedding))
	) > match_threshold
	ORDER BY similarity DESC
	LIMIT match_count;
	$$;

	-- ============================================================================
	-- SECTION 4: PERFORMANCE OPTIMIZATION
	-- ============================================================================

	-- Set statement timeout for search functions to prevent long-running queries
	ALTER FUNCTION match_hypernodes_by_key SET statement_timeout = '30s';
	ALTER FUNCTION match_hypernodes_by_value SET statement_timeout = '30s';
	ALTER FUNCTION match_hypernodes_combined SET statement_timeout = '30s';

	-- Update table statistics for query optimizer
	ANALYZE hypernodes;

	-- ============================================================================
	-- SECTION 5: MAINTENANCE OPERATIONS
	-- ============================================================================

	-- These are optional maintenance commands. Uncomment to use when needed.

	-- -----------------------------------------------------------------------------
	-- 5.1 Check Database Status
	-- -----------------------------------------------------------------------------

	-- Check total number of nodes
	-- SELECT COUNT(*) as total_nodes FROM hypernodes;

	-- Check data distribution
	-- SELECT plant_name, COUNT(*) as node_count
	-- FROM hypernodes
	-- GROUP BY plant_name
	-- ORDER BY node_count DESC
	-- LIMIT 20;

	-- -----------------------------------------------------------------------------
	-- 5.2 Clean Duplicate Nodes (Run only if needed)
	-- -----------------------------------------------------------------------------

	-- Check for duplicates first
	-- SELECT COUNT(*) as duplicate_count
	-- FROM (
	-- SELECT key, value, plant_name, COUNT(*) as cnt
	-- FROM hypernodes
	-- GROUP BY key, value, plant_name
	-- HAVING COUNT(*) > 1
	-- ) AS duplicates;

	-- Delete duplicates (keeps lowest ID)
	-- DELETE FROM hypernodes a
	-- USING hypernodes b
	-- WHERE a.id > b.id
	-- AND a.key = b.key
	-- AND a.value = b.value
	-- AND a.plant_name = b.plant_name;

	-- -----------------------------------------------------------------------------
	-- 5.3 Clear All Data (USE WITH CAUTION!)
	-- -----------------------------------------------------------------------------

	-- Truncate all data and reset auto-increment
	-- WARNING: This deletes ALL data!
	-- TRUNCATE TABLE hypernodes;

	-- Verify deletion
	-- SELECT COUNT(*) as remaining_nodes FROM hypernodes;

	-- -----------------------------------------------------------------------------
	-- 5.4 Rebuild Indexes (Run after large data changes)
	-- -----------------------------------------------------------------------------

	-- Rebuild vector indexes
	-- REINDEX INDEX hypernodes_key_embedding_idx;
	-- REINDEX INDEX hypernodes_value_embedding_idx;

	-- Rebuild regular indexes
	-- REINDEX INDEX hypernodes_plant_name_idx;
	-- REINDEX INDEX hypernodes_section_idx;
	-- REINDEX INDEX hypernodes_plant_name_lower_idx;

	-- Update statistics after rebuild
	-- ANALYZE hypernodes;

	-- ============================================================================
	-- SETUP COMPLETE!
	-- ============================================================================
	-- Next steps:
	-- 1. Import your plant hypernode data using the import scripts
	-- 2. Verify data with: SELECT COUNT(*) FROM hypernodes;
	-- 3. Test vector search functions with sample queries
	-- ============================================================================