Coverage for src/qdrant_loader/core/state/models.py: 99%

96 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-08 06:05 +0000

1""" 

2SQLAlchemy models for state management database. 

3""" 

4 

5from datetime import UTC 

6 

7from sqlalchemy import ( 

8 Boolean, 

9 Column, 

10 Float, 

11 ForeignKey, 

12 Index, 

13 Integer, 

14 String, 

15 Text, 

16 TypeDecorator, 

17 UniqueConstraint, 

18) 

19from sqlalchemy import DateTime as SQLDateTime 

20from sqlalchemy.orm import declarative_base, relationship 

21 

22from qdrant_loader.utils.logging import LoggingConfig 

23 

24logger = LoggingConfig.get_logger(__name__) 

25 

26 

class UTCDateTime(TypeDecorator):
    """Datetime column type whose values are always timezone-aware UTC.

    Values are normalised in both directions:

    * naive datetimes are taken to already be in UTC and are tagged with
      ``tzinfo=UTC``;
    * aware datetimes in another zone are converted to the same instant
      expressed in UTC, so that a backend which does not persist the offset
      cannot silently shift the stored instant;
    * ``None`` passes through unchanged.
    """

    impl = SQLDateTime
    cache_ok = True

    @staticmethod
    def _as_utc(value):
        """Return ``value`` as an aware UTC datetime (``None`` stays ``None``)."""
        if value is None:
            return None
        # A tzinfo whose utcoffset() is None still counts as naive.
        if value.tzinfo is None or value.utcoffset() is None:
            return value.replace(tzinfo=UTC)
        # Same instant, expressed in UTC; a no-op for values already in UTC.
        return value.astimezone(UTC)

    def process_bind_param(self, value, _dialect):
        """Normalise a Python value to UTC before it is sent to the database.

        Args:
            value: ``datetime`` (naive or aware) or ``None``.
            _dialect: Dialect in use; not consulted.

        Returns:
            The aware UTC ``datetime``, or ``None``.
        """
        return self._as_utc(value)

    def process_result_value(self, value, _dialect):
        """Normalise a value loaded from the database to aware UTC.

        Args:
            value: ``datetime`` as returned by the driver, or ``None``.
            _dialect: Dialect in use; not consulted.

        Returns:
            The aware UTC ``datetime``, or ``None``.
        """
        return self._as_utc(value)

44 

45 

46Base = declarative_base() 

47 

48 

class Project(Base):
    """ORM model for the ``projects`` table: project metadata and configuration.

    Each row carries the project's identifier, display metadata, the QDrant
    collection it maps to and a hash of its configuration. ``collection_name``
    is unique across projects and ``display_name`` is indexed. Related
    sources, ingestion histories and document states are cascaded with
    ``all, delete-orphan``.
    """

    __tablename__ = "projects"
    __table_args__ = (
        UniqueConstraint("collection_name", name="uix_project_collection"),
        Index("ix_project_display_name", "display_name"),
    )

    # Project identifier
    id = Column(String, primary_key=True)
    # Human-readable project name
    display_name = Column(String, nullable=False)
    # Project description
    description = Column(Text, nullable=True)
    # QDrant collection name
    collection_name = Column(String, nullable=False)
    # Hash of project configuration
    config_hash = Column(String, nullable=True)
    # Row bookkeeping timestamps
    created_at = Column(UTCDateTime(timezone=True), nullable=False)
    updated_at = Column(UTCDateTime(timezone=True), nullable=False)

    # Relationships
    sources = relationship(
        "ProjectSource",
        cascade="all, delete-orphan",
        back_populates="project",
    )
    ingestion_histories = relationship(
        "IngestionHistory",
        cascade="all, delete-orphan",
        back_populates="project",
    )
    document_states = relationship(
        "DocumentStateRecord",
        cascade="all, delete-orphan",
        back_populates="project",
    )

77 

78 

class ProjectSource(Base):
    """ORM model for ``project_sources``: per-project source config and status.

    A source is identified within its project by ``source_type`` and
    ``source_name``; that triple is unique. ``status`` and ``source_type``
    are indexed.
    """

    __tablename__ = "project_sources"
    __table_args__ = (
        UniqueConstraint(
            "project_id",
            "source_type",
            "source_name",
            name="uix_project_source",
        ),
        Index("ix_project_source_status", "status"),
        Index("ix_project_source_type", "source_type"),
    )

    id = Column(Integer, autoincrement=True, primary_key=True)
    # Owning project; the foreign key is declared with ON DELETE CASCADE.
    project_id = Column(
        String,
        ForeignKey("projects.id", ondelete="CASCADE"),
        nullable=False,
    )
    # git, confluence, jira, etc.
    source_type = Column(String, nullable=False)
    # Source identifier within project
    source_name = Column(String, nullable=False)
    # Hash of source configuration
    config_hash = Column(String, nullable=True)
    # Last successful sync
    last_sync_time = Column(UTCDateTime(timezone=True), nullable=True)
    # pending, syncing, completed, error
    status = Column(String, nullable=False, default="pending")
    # Last error message if any
    error_message = Column(Text, nullable=True)
    # Row bookkeeping timestamps
    created_at = Column(UTCDateTime(timezone=True), nullable=False)
    updated_at = Column(UTCDateTime(timezone=True), nullable=False)

    # Relationships
    project = relationship("Project", back_populates="sources")

111 

112 

class IngestionHistory(Base):
    """ORM model for ``ingestion_history``: ingestion history for each source.

    Rows are unique per (``project_id``, ``source_type``, ``source``) and
    ``project_id`` is indexed. Alongside the ingestion status and document
    count, each row holds file-conversion counters.
    """

    __tablename__ = "ingestion_history"
    __table_args__ = (
        UniqueConstraint(
            "project_id",
            "source_type",
            "source",
            name="uix_project_source_ingestion",
        ),
        Index("ix_ingestion_project_id", "project_id"),
    )

    id = Column(Integer, autoincrement=True, primary_key=True)
    # Nullable for backward compatibility
    project_id = Column(
        String,
        ForeignKey("projects.id", ondelete="CASCADE"),
        nullable=True,
    )
    source_type = Column(String, nullable=False)
    source = Column(String, nullable=False)
    last_successful_ingestion = Column(UTCDateTime(timezone=True), nullable=False)
    status = Column(String, nullable=False)
    document_count = Column(Integer, default=0)
    error_message = Column(String)
    # Row bookkeeping timestamps
    created_at = Column(UTCDateTime(timezone=True), nullable=False)
    updated_at = Column(UTCDateTime(timezone=True), nullable=False)

    # File conversion metrics
    converted_files_count = Column(Integer, default=0)
    conversion_failures_count = Column(Integer, default=0)
    attachments_processed_count = Column(Integer, default=0)
    total_conversion_time = Column(Float, default=0.0)

    # Relationships
    project = relationship("Project", back_populates="ingestion_histories")

146 

147 

class DocumentStateRecord(Base):
    """ORM model for ``document_states``: the state of individual documents.

    A document is unique per (``project_id``, ``source_type``, ``source``,
    ``document_id``). Besides the core identity and content hash, a row
    carries file-conversion metadata and attachment metadata. Several of
    those columns are indexed (see ``__table_args__``).
    """

    __tablename__ = "document_states"
    __table_args__ = (
        UniqueConstraint(
            "project_id",
            "source_type",
            "source",
            "document_id",
            name="uix_project_document",
        ),
        Index("ix_document_url", "url"),
        Index("ix_document_converted", "is_converted"),
        Index("ix_document_attachment", "is_attachment"),
        Index("ix_document_parent", "parent_document_id"),
        Index("ix_document_conversion_method", "conversion_method"),
        Index("ix_document_project_id", "project_id"),
    )

    id = Column(Integer, autoincrement=True, primary_key=True)
    # Nullable for backward compatibility
    project_id = Column(
        String,
        ForeignKey("projects.id", ondelete="CASCADE"),
        nullable=True,
    )
    document_id = Column(String, nullable=False)
    source_type = Column(String, nullable=False)
    source = Column(String, nullable=False)
    url = Column(String, nullable=False)
    title = Column(String, nullable=False)
    content_hash = Column(String, nullable=False)
    is_deleted = Column(Boolean, default=False)
    # Row bookkeeping timestamps
    created_at = Column(UTCDateTime(timezone=True), nullable=False)
    updated_at = Column(UTCDateTime(timezone=True), nullable=False)

    # File conversion metadata
    is_converted = Column(Boolean, default=False)
    # 'markitdown', 'markitdown_fallback', etc.
    conversion_method = Column(String, nullable=True)
    # Original file extension/MIME type
    original_file_type = Column(String, nullable=True)
    # Original filename
    original_filename = Column(String, nullable=True)
    # File size in bytes
    file_size = Column(Integer, nullable=True)
    conversion_failed = Column(Boolean, default=False)
    # Error message if conversion failed
    conversion_error = Column(Text, nullable=True)
    # Time taken for conversion in seconds
    conversion_time = Column(Float, nullable=True)

    # Attachment metadata
    is_attachment = Column(Boolean, default=False)
    # ID of parent document for attachments
    parent_document_id = Column(String, nullable=True)
    # Unique attachment identifier
    attachment_id = Column(String, nullable=True)
    # Original attachment filename
    attachment_filename = Column(String, nullable=True)
    # MIME type of attachment
    attachment_mime_type = Column(String, nullable=True)
    # Original download URL
    attachment_download_url = Column(String, nullable=True)
    # Author of attachment
    attachment_author = Column(String, nullable=True)
    # Attachment creation date
    attachment_created_at = Column(UTCDateTime(timezone=True), nullable=True)

    # Relationships
    project = relationship("Project", back_populates="document_states")