Coverage for src/qdrant_loader/core/state/models.py: 99%

96 statements  

« prev     ^ index     » next       coverage.py v7.8.2, created at 2025-06-04 05:50 +0000

1""" 

2SQLAlchemy models for state management database. 

3""" 

4 

5from datetime import UTC 

6 

7from sqlalchemy import ( 

8 Boolean, 

9 Column, 

10 Index, 

11 Integer, 

12 String, 

13 TypeDecorator, 

14 UniqueConstraint, 

15 Text, 

16 Float, 

17 ForeignKey, 

18) 

19from sqlalchemy import DateTime as SQLDateTime 

20from sqlalchemy.orm import declarative_base, relationship 

21 

22from qdrant_loader.utils.logging import LoggingConfig 

23 

# Module-level logger, obtained through the project's LoggingConfig helper.
logger = LoggingConfig.get_logger(__name__)

25 

26 

class UTCDateTime(TypeDecorator):
    """Datetime column type that always yields timezone-aware UTC values.

    The same normalisation is applied when binding parameters and when
    loading results:

    * ``None`` is passed through unchanged.
    * Naive datetimes are assumed to already be in UTC and are tagged with
      ``tzinfo=UTC`` (wall-clock fields are left untouched).
    * Aware datetimes in any other zone are converted to UTC, so the stored
      instant stays correct even on a backend that does not persist the
      offset (such values come back naive and are relabelled as UTC here).
    """

    impl = SQLDateTime
    cache_ok = True

    @staticmethod
    def _as_utc(value):
        """Return ``value`` as an aware UTC datetime; ``None`` stays ``None``."""
        if value is None:
            return None
        # A datetime is naive when it reports no UTC offset; this also covers
        # a tzinfo object whose utcoffset() returns None.
        if value.utcoffset() is None:
            return value.replace(tzinfo=UTC)
        # Aware value: keep the instant, express it in UTC.
        return value.astimezone(UTC)

    def process_bind_param(self, value, dialect):
        """Normalise a Python value to aware UTC before it is sent to the DB."""
        return self._as_utc(value)

    def process_result_value(self, value, dialect):
        """Normalise a value loaded from the DB to aware UTC."""
        return self._as_utc(value)

44 

45 

# Declarative base class shared by every ORM model defined in this module.
Base = declarative_base()

47 

48 

class Project(Base):
    """ORM model for the ``projects`` table (project metadata and configuration).

    A project owns its sources, ingestion histories and document states;
    each of those relationships uses the ``all, delete-orphan`` cascade.
    """

    __tablename__ = "projects"

    # Project identifier (string primary key).
    id = Column(String, primary_key=True)
    # Human-readable project name.
    display_name = Column(String, nullable=False)
    # Optional free-text project description.
    description = Column(Text, nullable=True)
    # QDrant collection name; unique across projects (see __table_args__).
    collection_name = Column(String, nullable=False)
    # Hash of the project configuration.
    config_hash = Column(String, nullable=True)
    # Row timestamps; no column default is declared, so writers must set them.
    created_at = Column(UTCDateTime(timezone=True), nullable=False)
    updated_at = Column(UTCDateTime(timezone=True), nullable=False)

    # Relationships to child records.
    sources = relationship(
        "ProjectSource",
        back_populates="project",
        cascade="all, delete-orphan",
    )
    ingestion_histories = relationship(
        "IngestionHistory",
        back_populates="project",
        cascade="all, delete-orphan",
    )
    document_states = relationship(
        "DocumentStateRecord",
        back_populates="project",
        cascade="all, delete-orphan",
    )

    __table_args__ = (
        UniqueConstraint("collection_name", name="uix_project_collection"),
        Index("ix_project_display_name", "display_name"),
    )

77 

78 

class ProjectSource(Base):
    """ORM model for ``project_sources`` (per-project source config and status).

    A row is unique per ``(project_id, source_type, source_name)``.
    """

    __tablename__ = "project_sources"

    # Surrogate auto-incrementing primary key.
    id = Column(Integer, primary_key=True, autoincrement=True)
    # Owning project; the foreign key is declared with ON DELETE CASCADE.
    project_id = Column(
        String,
        ForeignKey("projects.id", ondelete="CASCADE"),
        nullable=False,
    )
    # Kind of source: git, confluence, jira, etc.
    source_type = Column(String, nullable=False)
    # Source identifier within the project.
    source_name = Column(String, nullable=False)
    # Hash of the source configuration.
    config_hash = Column(String, nullable=True)
    # Time of the last successful sync, if any.
    last_sync_time = Column(UTCDateTime(timezone=True), nullable=True)
    # Sync status: pending, syncing, completed, error.
    status = Column(String, nullable=False, default="pending")
    # Last error message, if any.
    error_message = Column(Text, nullable=True)
    # Row timestamps; no column default is declared, so writers must set them.
    created_at = Column(UTCDateTime(timezone=True), nullable=False)
    updated_at = Column(UTCDateTime(timezone=True), nullable=False)

    # Back-reference to the owning project.
    project = relationship("Project", back_populates="sources")

    __table_args__ = (
        UniqueConstraint(
            "project_id",
            "source_type",
            "source_name",
            name="uix_project_source",
        ),
        Index("ix_project_source_status", "status"),
        Index("ix_project_source_type", "source_type"),
    )

111 

112 

class IngestionHistory(Base):
    """ORM model for ``ingestion_history`` (ingestion history per source).

    Besides the ingestion status and document count, each row carries
    aggregate file-conversion metrics.
    """

    __tablename__ = "ingestion_history"

    # Surrogate auto-incrementing primary key.
    id = Column(Integer, primary_key=True, autoincrement=True)
    # Owning project; nullable for backward compatibility.
    project_id = Column(
        String,
        ForeignKey("projects.id", ondelete="CASCADE"),
        nullable=True,
    )
    source_type = Column(String, nullable=False)
    source = Column(String, nullable=False)
    last_successful_ingestion = Column(UTCDateTime(timezone=True), nullable=False)
    status = Column(String, nullable=False)
    document_count = Column(Integer, default=0)
    # Nullable: no ``nullable=False`` is declared for this column.
    error_message = Column(String)
    # Row timestamps; no column default is declared, so writers must set them.
    created_at = Column(UTCDateTime(timezone=True), nullable=False)
    updated_at = Column(UTCDateTime(timezone=True), nullable=False)

    # File conversion metrics (counters default to 0, elapsed time to 0.0).
    converted_files_count = Column(Integer, default=0)
    conversion_failures_count = Column(Integer, default=0)
    attachments_processed_count = Column(Integer, default=0)
    total_conversion_time = Column(Float, default=0.0)

    # Back-reference to the owning project.
    project = relationship("Project", back_populates="ingestion_histories")

    __table_args__ = (
        UniqueConstraint(
            "project_id",
            "source_type",
            "source",
            name="uix_project_source_ingestion",
        ),
        # Legacy constraint, kept for backward compatibility.
        # NOTE(review): it does not include project_id, so it is stricter
        # than uix_project_source_ingestion - confirm this is still intended.
        UniqueConstraint("source_type", "source", name="uix_source"),
        Index("ix_ingestion_project_id", "project_id"),
    )

148 

149 

class DocumentStateRecord(Base):
    """ORM model for ``document_states`` (state of individual documents).

    Columns fall into three groups: core document identity and content hash,
    file-conversion metadata, and attachment metadata.
    """

    __tablename__ = "document_states"

    # --- Core document identity -------------------------------------------
    # Surrogate auto-incrementing primary key.
    id = Column(Integer, primary_key=True, autoincrement=True)
    # Owning project; nullable for backward compatibility.
    project_id = Column(
        String,
        ForeignKey("projects.id", ondelete="CASCADE"),
        nullable=True,
    )
    document_id = Column(String, nullable=False)
    source_type = Column(String, nullable=False)
    source = Column(String, nullable=False)
    url = Column(String, nullable=False)
    title = Column(String, nullable=False)
    content_hash = Column(String, nullable=False)
    is_deleted = Column(Boolean, default=False)
    # Row timestamps; no column default is declared, so writers must set them.
    created_at = Column(UTCDateTime(timezone=True), nullable=False)
    updated_at = Column(UTCDateTime(timezone=True), nullable=False)

    # --- File conversion metadata -----------------------------------------
    is_converted = Column(Boolean, default=False)
    # Conversion method: 'markitdown', 'markitdown_fallback', etc.
    conversion_method = Column(String, nullable=True)
    # Original file extension/MIME type.
    original_file_type = Column(String, nullable=True)
    # Original filename.
    original_filename = Column(String, nullable=True)
    # File size in bytes.
    file_size = Column(Integer, nullable=True)
    conversion_failed = Column(Boolean, default=False)
    # Error message if conversion failed.
    conversion_error = Column(Text, nullable=True)
    # Time taken for conversion, in seconds.
    conversion_time = Column(Float, nullable=True)

    # --- Attachment metadata ----------------------------------------------
    is_attachment = Column(Boolean, default=False)
    # ID of the parent document, for attachments.
    parent_document_id = Column(String, nullable=True)
    # Unique attachment identifier.
    attachment_id = Column(String, nullable=True)
    # Original attachment filename.
    attachment_filename = Column(String, nullable=True)
    # MIME type of the attachment.
    attachment_mime_type = Column(String, nullable=True)
    # Original download URL.
    attachment_download_url = Column(String, nullable=True)
    # Author of the attachment.
    attachment_author = Column(String, nullable=True)
    # Attachment creation date.
    attachment_created_at = Column(UTCDateTime(timezone=True), nullable=True)

    # Back-reference to the owning project.
    project = relationship("Project", back_populates="document_states")

    __table_args__ = (
        UniqueConstraint(
            "project_id",
            "source_type",
            "source",
            "document_id",
            name="uix_project_document",
        ),
        # Legacy constraint, kept for backward compatibility.
        # NOTE(review): it does not include project_id, so it is stricter
        # than uix_project_document - confirm this is still intended.
        UniqueConstraint("source_type", "source", "document_id", name="uix_document"),
        Index("ix_document_url", "url"),
        Index("ix_document_converted", "is_converted"),
        Index("ix_document_attachment", "is_attachment"),
        Index("ix_document_parent", "parent_document_id"),
        Index("ix_document_conversion_method", "conversion_method"),
        Index("ix_document_project_id", "project_id"),
    )