"""Chunk entity model for semantic code units."""

from dataclasses import dataclass, field
from enum import Enum
from typing import Optional
from uuid import uuid4


class ChunkType(str, Enum):
    """Type of code chunk.

    The ``str`` mixin makes every member compare equal to its plain string
    value (``ChunkType.CLASS == "class"``). Calling ``ChunkType("class")``
    returns the member holding that value and raises ``ValueError`` when no
    member has it.
    """

    FUNCTION = "function"
    CLASS = "class"
    METHOD = "method"
    MODULE = "module"
    TEXT = "text"
    DOCSTRING = "docstring"
    COMMENT = "comment"


@dataclass
class ChunkMetadata:
    """Descriptive details attached to a single code chunk.

    The first four fields must be supplied; every other field falls back to
    ``None`` when omitted.
    """

    # Required fields.
    file_path: str
    start_line: int
    end_line: int
    chunk_type: ChunkType

    # Optional fields, all defaulting to None.
    language: Optional[str] = None
    name: Optional[str] = None
    signature: Optional[str] = None
    docstring: Optional[str] = None
    parent_name: Optional[str] = None

    @property
    def line_range(self) -> str:
        """Return the span formatted as ``"<start_line>-<end_line>"``."""
        first, last = self.start_line, self.end_line
        return f"{first}-{last}"

    @property
    def citation(self) -> str:
        """Return the reference string ``"[<file_path>:<start_line>-<end_line>]"``."""
        # The trailing part is exactly the line_range text, so reuse it.
        span = self.line_range
        return f"[{self.file_path}:{span}]"


@dataclass
class Chunk:
    """A piece of code or documentation plus the metadata describing it.

    ``id`` defaults to a newly generated UUID4 rendered as a string, and
    ``embedding`` defaults to ``None``.
    """

    content: str
    metadata: ChunkMetadata
    repo_id: str
    id: str = field(default_factory=lambda: str(uuid4()))
    embedding: Optional[list[float]] = None

    # The properties below simply forward to the attached metadata object.

    @property
    def file_path(self) -> str:
        """Shortcut for ``metadata.file_path``."""
        meta = self.metadata
        return meta.file_path

    @property
    def start_line(self) -> int:
        """Shortcut for ``metadata.start_line``."""
        meta = self.metadata
        return meta.start_line

    @property
    def end_line(self) -> int:
        """Shortcut for ``metadata.end_line``."""
        meta = self.metadata
        return meta.end_line

    @property
    def chunk_type(self) -> ChunkType:
        """Shortcut for ``metadata.chunk_type``."""
        meta = self.metadata
        return meta.chunk_type

    @property
    def name(self) -> Optional[str]:
        """Shortcut for ``metadata.name``."""
        meta = self.metadata
        return meta.name

    @property
    def citation(self) -> str:
        """Shortcut for ``metadata.citation``."""
        meta = self.metadata
        return meta.citation

    def to_dict(self) -> dict:
        """Flatten the chunk and its metadata into one dictionary for storage.

        The chunk type is written as its enum ``value``. The embedding is not
        part of the result.
        """
        meta = self.metadata
        # Chunk-level fields first, then the metadata fields, in a fixed order.
        record = {
            "id": self.id,
            "content": self.content,
            "repo_id": self.repo_id,
        }
        record.update(
            file_path=meta.file_path,
            start_line=meta.start_line,
            end_line=meta.end_line,
            chunk_type=meta.chunk_type.value,
            language=meta.language,
            name=meta.name,
            signature=meta.signature,
            docstring=meta.docstring,
            parent_name=meta.parent_name,
        )
        return record

    @classmethod
    def from_dict(cls, data: dict, embedding: Optional[list[float]] = None) -> "Chunk":
        """Build a chunk from a flat dictionary shaped like ``to_dict`` output.

        Args:
            data: Mapping that must contain ``id``, ``content``, ``repo_id``,
                ``file_path``, ``start_line``, ``end_line`` and ``chunk_type``.
                The keys ``language``, ``name``, ``signature``, ``docstring``
                and ``parent_name`` are optional and read as ``None`` when
                absent.
            embedding: Value stored on the new chunk's ``embedding`` field.

        Returns:
            The newly constructed chunk.

        Raises:
            KeyError: If a required key is missing from ``data``.
            ValueError: If ``data["chunk_type"]`` is not a ``ChunkType`` value.
        """
        # Required location keys are indexed directly so a missing one raises.
        location = {
            key: data[key] for key in ("file_path", "start_line", "end_line")
        }
        kind = ChunkType(data["chunk_type"])
        # Optional descriptive keys fall back to None via dict.get.
        extras = {
            key: data.get(key)
            for key in ("language", "name", "signature", "docstring", "parent_name")
        }
        meta = ChunkMetadata(chunk_type=kind, **location, **extras)
        return cls(
            id=data["id"],
            content=data["content"],
            metadata=meta,
            repo_id=data["repo_id"],
            embedding=embedding,
        )