# content_system.py
  1. """
  2. A registry for content sources that work in terms of the View Model (view_model.py).
  3. Generally a source returns a CollectionPage or individual items.
  4. At present many sources return a List of Maps because the design is being discovered and solidified as it makes sense rather than big design up front.
  5. May end up similar to Android's ContentProvider, found later. I was
  6. thinking about using content:// URI scheme.
  7. https://developer.android.com/reference/android/content/ContentProvider
  8. Could also be similar to a Coccoon Generator
  9. https://cocoon.apache.org/1363_1_1.html
  10. Later processing in Python:
  11. https://www.innuy.com/blog/build-data-pipeline-python/
  12. https://www.bonobo-project.org/
  13. """
import re
import inspect
from functools import lru_cache
import time


def get_ttl_hash(seconds=3600):
    """Return the same value within a `seconds` time period."""
    return round(time.time() / seconds)
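
# This appears intended for the common lru_cache time-to-live trick: when the
# (currently commented-out) lru_cache decorators below are enabled, callers
# pass e.g. `ttl_hash=get_ttl_hash(60)` so the cache key rolls over each period
# and stale entries stop being returned. Sketch (hypothetical content ID):
#   get_content('notes:note:42', ttl_hash=get_ttl_hash(60))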


class ContentSystem:

    def __init__(self):
        self.content_sources = {}
        self.hooks = {}

    def register_content_source(self, id_prefix, content_source_fn, id_pattern=r'(\d+)', source_id=None, weight=None):
        if not source_id:
            source_id = f'{inspect.getmodule(content_source_fn).__name__}:{content_source_fn.__name__}'
        if weight is None:
            weight = 1000 - len(self.content_sources)
        print(f'register_content_source: {id_prefix}: {source_id} with ID pattern {id_pattern} (weight={weight})')
        self.content_sources[source_id] = [id_prefix, content_source_fn, id_pattern, source_id, weight]

    @lru_cache(maxsize=1024)  # NOTE: mutating return value mutates cached value
    def find_content_id_args(self, id_pattern, content_id):
        id_args = re.fullmatch(id_pattern, content_id)
        if not id_args:
            return [], {}
        args = []
        kwargs = id_args.groupdict()
        if not kwargs:
            args = id_args.groups()
        return args, kwargs
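
    # Example (illustrative): find_content_id_args(r'(?P<user>\w+)/(?P<slug>[\w-]+)',
    # 'alice/hello-world') returns ([], {'user': 'alice', 'slug': 'hello-world'}),
    # while find_content_id_args(r'(\d+)', '42') returns (('42',), {}).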

    def resolve_content_source(self, content_id, content_source_id=None, *extra_args, **extra_kwargs):
        """
        Resolve possible content sources given the parameters used for get_content.
        Returns a generator; typically the first result will be used.
        Allows content modules to determine whether sources are available,
        without fetching the content itself.
        """
        #source_ids = list(self.content_sources.keys())
        #source_ids.sort(key=lambda id_prefix: len(id_prefix), reverse=True)
        source_ids = list(self.content_sources.keys())
        source_ids.sort(key=lambda source_id: self.content_sources[source_id][4], reverse=True)  # 4 = weight
        #print(source_ids)
        for source_id in source_ids:
            if content_source_id and source_id != content_source_id:
                continue
            [id_prefix, content_source_fn, id_pattern, source_id, weight] = self.content_sources[source_id]
            if not content_id.startswith(id_prefix):
                continue
            source_content_id = content_id[len(id_prefix):]
            # HACK
            if not id_prefix.endswith(':') and source_content_id:
                continue
            print(f'get_content (id={content_id}) from source {source_id}, resolves to {source_content_id} (weight={weight})')
            args, kwargs = self.find_content_id_args(id_pattern, source_content_id)
            # HACK
            if id_prefix.endswith(':') and not args and not kwargs:
                continue
            if extra_args:
                # copy before appending so we don't mutate the lru_cache'd value
                args = tuple(args) + tuple(extra_args)
            if extra_kwargs:
                kwargs = {**extra_kwargs, **kwargs}
            # if we're calling a bulk source and only get back partial results...
            # we'd want to remove the found content IDs and merge until
            # we find them all...
            # yet we don't want intelligence about the type of content returned.
            # idea: class BulkResponse(dict): pass
            yield content_source_fn, args, kwargs
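
    # Example (illustrative, `cs` being a ContentSystem with the hypothetical
    # 'notes:note:' source from the module header registered):
    #   for fn, args, kwargs in cs.resolve_content_source('notes:note:42'):
    #       print(fn.__name__, args, kwargs)   # -> get_note ('42',) {}
    #       break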

    #@lru_cache(maxsize=64)
    def get_content(self, content_id, content_source_id=None, ttl_hash=get_ttl_hash(60), *extra_args, **extra_kwargs):
        """
        NOTE: mutating return value mutates cached value
        """
        print(f'get_content {content_id}')
        for content_source_fn, args, kwargs in self.resolve_content_source(
                content_id,
                content_source_id,
                *extra_args,
                **extra_kwargs):
            content = content_source_fn(*args, **kwargs)
            if content:
                self.invoke_hooks('got_content', content_id, content)
                return content

    #@lru_cache(maxsize=8)  # NOTE: mutating return value mutates cached value
    def get_all_content(self, content_ids, enable_bulk_fetch=False, ttl_hash=get_ttl_hash(60)):
        """
        Get content from all sources, using a grouping call if possible.
        Returns a map of content_id to result; the caller needs
        to have the intelligence to merge and paginate.
        The naive implementation is to just make one call to get_content per ID,
        but we need to figure out a way to pass a list of IDs and pagination
        per source; for example a list of 100+ Tweet IDs and 100+ YT videos
        from a Swipe file.
        """
        return self.get_all_content2(content_ids, enable_bulk_fetch=enable_bulk_fetch)
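
    # Example (illustrative): get_all_content(['notes:note:1', 'notes:note:2'])
    # returns {'notes:note:1': <content>, 'notes:note:2': <content>}, making one
    # get_content call per ID unless enable_bulk_fetch groups them.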

    def get_all_content2(self, content_collection_ids, content_args=None, enable_bulk_fetch=False):
        """
        Takes a list of collection IDs; content_args is a map of (args, kwargs) keyed by collection ID.
        We could just use keys from content_args with empty values, but that's a little confusing.
        Interleaving the next page of a source into existing results is a problem.
        A gracefully degraded version could simply get the next page at the end of all pages and then
        view older content.
        We also need intelligence about content types, meaning perhaps some lambdas passed in,
        e.g. CollectionPage.
        See the feeds facade for an example of merging one page.
        Seems like keeping feed items in a DB is becoming the way to go, serving things in order.
        Client-side content merging might work to insert nodes above, e.g. with HTMX.
        Might be jarring to the reader, so make it optional: append all new, or merge.
        Cache the feed between requests on disk, merge in memory, send the merge/append result.
        """
        bulk_prefixes = {
            #'twitter:tweet:': 'twitter:tweets',
            #'youtube:video:': 'youtube:videos',
        }
        bulk_requests = {}
        result = {}
        for content_id in content_collection_ids:
            is_bulk = False
            if enable_bulk_fetch:
                for bulk_prefix in bulk_prefixes:
                    if content_id.startswith(bulk_prefix):
                        bulk_content_id = bulk_prefixes[bulk_prefix]
                        if bulk_content_id not in bulk_requests:
                            bulk_requests[bulk_content_id] = []
                        bulk_requests[bulk_content_id].append(content_id)
                        # max size for a content source...
                        is_bulk = True
            if is_bulk:
                continue
            if content_args and content_id in content_args:
                extra_args, extra_kwargs = content_args[content_id]
            else:
                extra_args, extra_kwargs = [], {}
            result[content_id] = self.get_content(content_id, *extra_args, **extra_kwargs)
        for bulk_content_id, content_ids in bulk_requests.items():
            print(f'bulk: {bulk_content_id}, content_ids: {content_ids}')
            bulk_response = self.get_content(bulk_content_id, content_ids=tuple(content_ids))  # FIXME me=... workaround, provide bulk id in args map
            print(f'bulk_response: {bulk_response}')
            # we're not supposed to be smart about the get_content response type...
            # does it return a map by convention? better than iterating something else.
            if bulk_response:
                for content_id, content in bulk_response.items():
                    if content:
                        self.invoke_hooks('got_content', content_id, content)
                result.update(bulk_response)
        return result

    def register_hook(self, hook_type, hook_fn, *extra_args, **extra_kwargs):
        if hook_type not in self.hooks:
            self.hooks[hook_type] = []
        self.hooks[hook_type].append([hook_fn, extra_args, extra_kwargs])

    def invoke_hooks(self, hook_type, *args, **kwargs):
        if hook_type not in self.hooks:
            return
        for hook, extra_args, extra_kwargs in self.hooks[hook_type]:
            hook_args = args
            hook_kwargs = kwargs
            if extra_args:
                hook_args = args + extra_args
            if extra_kwargs:
                hook_kwargs = {**extra_kwargs, **hook_kwargs}
            hook(*hook_args, **hook_kwargs)
            #try:
            #    hook(*args, **kwargs)
            #except TypeError as e:
            #    print('tried to call a hook with wrong args. no problem')
            #    continue


class ObjectCache:

    create_stmt = """
        create table content (
            provider text,
            id text,
            dt datetime,
            args text, -- could hash
            type text,
            data blob,
            unique (provider, id, dt, args)
        )
    """

    insert_stmt = """
        INSERT INTO content (dt, provider, id, args, type, data)
        VALUES (current_timestamp, ?, ?, ?, ?, ?)
    """

    select_latest_stmt = """
        SELECT * from content
        WHERE {where_sql}
        GROUP BY provider, id, dt, args
        HAVING dt = max(dt)
    """

    def __init__(self, db_path):
        self.db_path = db_path

    def put(self, key, value):
        pass

    def get(self, key):
        pass
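
# A possible sqlite3-backed sketch of ObjectCache.put/get (an assumption, not
# the author's implementation; the key-to-column mapping and JSON serialization
# are illustrative only):
#
#   import json, sqlite3
#
#   def put(self, key, value):
#       provider, _, content_id = key.partition(':')
#       with sqlite3.connect(self.db_path) as con:
#           con.execute(self.insert_stmt,
#                       (provider, content_id, '', type(value).__name__, json.dumps(value)))
#
#   def get(self, key):
#       provider, _, content_id = key.partition(':')
#       with sqlite3.connect(self.db_path) as con:
#           rows = con.execute(
#               self.select_latest_stmt.format(where_sql='provider = ? and id = ?'),
#               (provider, content_id)).fetchall()
#       return json.loads(rows[0][5]) if rows else None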

# The app was coded before we turned this into a class...
# so we proxy calls with the old interface to this default instance.
DEFAULT = ContentSystem()


def reset():
    global DEFAULT
    print('compat resetting content system')
    DEFAULT = ContentSystem()


def register_content_source(id_prefix, content_source_fn, id_pattern=r'(\d+)', source_id=None, weight=None):
    print('compat register_content_source')
    return DEFAULT.register_content_source(id_prefix, content_source_fn, id_pattern, source_id, weight)


def get_content(content_id, content_source_id=None, *extra_args, **extra_kwargs):
    print('compat get_content')
    return DEFAULT.get_content(content_id, content_source_id, *extra_args, **extra_kwargs)


def get_all_content(content_ids, enable_bulk_fetch=False):
    print('compat get_all_content')
    return DEFAULT.get_all_content(content_ids, enable_bulk_fetch=enable_bulk_fetch)


def register_hook(hook_type, hook_fn, *extra_args, **extra_kwargs):
    print('compat register_hook')
    return DEFAULT.register_hook(hook_type, hook_fn, *extra_args, **extra_kwargs)


def invoke_hooks(hook_type, *args, **kwargs):
    print('compat invoke_hooks')
    return DEFAULT.invoke_hooks(hook_type, *args, **kwargs)
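
# Hook usage sketch (illustrative; `remember` and `seen` are hypothetical):
#
#   seen = {}
#
#   def remember(content_id, content, store):
#       store[content_id] = content
#
#   register_hook('got_content', remember, store=seen)
#   # every successful get_content/get_all_content now records into `seen`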