+ Open graph check script

Basically this is going to be used to verify that certain Open Graph tags actually point at available content.
We do this by attempting to fetch the intended content locally
instead of relying on sites like opengraph.xyz (which is great but really cumbersome).
! At some point we might want to improve this by adding more features but
! I'll cross that bridge when I get there
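Expected usage is a single URL argument; for each og:image tag found, the script prints the tag's content and the HTTP status returned when fetching it (the URL below is only a placeholder):

    python scripts/check-og-tags.py https://example.com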
shockrah 2021-09-24 20:55:24 -07:00
parent 6ce00bae32
commit 02a0e26dd1

scripts/check-og-tags.py (new file, 98 lines)

@@ -0,0 +1,98 @@
import sys
from html.parser import HTMLParser

from requests import get


class OpenGraphTag:
    '''
    Represents a single Open Graph tag
    Fields:
        @base_url the base url where the open graph tag comes from
        @prop     mapping of the "property" attribute from the page HTML
        @content  mapping of the "content" attribute from the page HTML
    '''
    def __init__(self, base_url: str, prop: str, content: str):
        self.base_url = base_url
        self.prop = prop
        self.content = content

    def get_content(self) -> int:
        '''
        Attempts to fetch the resource listed in self.content
        Returns the HTTP status code for the requested resource
        Useful for making sure that path names in the content fields are correct,
        to avoid pushing up Open Graph data that isn't linked/set up correctly
        Example:
            Intended image content: "/media/thumbnail/image.jpg"
                GET'ing this results in 200 and the image
            Actual image content: "/image.jpg"
                GET'ing this results in 404 and no image
        '''
        url = self.content
        # Relative paths are resolved against the page's base URL
        if self.content.startswith('/'):
            url = self.base_url + self.content
        response = get(url)
        return response.status_code


def fetch_url(url: str) -> tuple[str, str]:
    '''
    Performs a single GET request to fetch the requested URL
    Returns a tuple containing: (page source, base_url)
    '''
    response = get(url)
    # Use the decoded text; str(response.content) would yield a "b'...'" repr
    # and mangle the quoting inside the markup
    source = response.text
    parts = url.split('/')  # [ scheme, '', domain, ...]
    try:
        base_url = parts[0] + '//' + parts[2]
        return source, base_url
    except IndexError:
        return source, ''


class Parser(HTMLParser):
    # This is a stupid hack to leak data from handle_starttag but fuck it
    leak = []

    def handle_starttag(self, tag, attrs):
        if tag != 'meta':
            return
        pairs = {}
        for attr in attrs:
            pairs[attr[0]] = attr[1]
        if str(pairs.get('property')).startswith('og:'):
            Parser.leak.append(pairs)


def get_meta_tags(source: str, base_url: str) -> list[OpenGraphTag]:
    '''
    Extracts the Open Graph <meta> tags from a given HTML string
    Returns a (possibly empty) list of OpenGraphTag objects
    '''
    parser = Parser()
    parser.feed(source)
    ret = []
    for entry in parser.leak:
        ret.append(OpenGraphTag(base_url, entry.get('property'), entry.get('content')))
    # Reset the class-level list so repeated calls don't accumulate old tags
    Parser.leak = []
    return ret


if __name__ == '__main__':
    if len(sys.argv) == 1:
        print('No url given!', file=sys.stderr)
        sys.exit(1)
    else:
        # Fetch the tags from the link provided
        tags = get_meta_tags(*fetch_url(sys.argv[1]))
        for tag in tags:
            if tag.prop == 'og:image':
                print(tag.content, tag.get_content())
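
A minimal sketch of programmatic reuse, assuming the file lives at scripts/check-og-tags.py as committed; the dashes in the name rule out a plain import, so importlib loads it, and the target URL is only a placeholder:

import importlib.util

# Load scripts/check-og-tags.py as a module; exec_module will not trigger the
# __main__ block because __name__ is set to the name chosen here
spec = importlib.util.spec_from_file_location('check_og_tags', 'scripts/check-og-tags.py')
og = importlib.util.module_from_spec(spec)
spec.loader.exec_module(og)

source, base = og.fetch_url('https://example.com')  # placeholder URL
for tag in og.get_meta_tags(source, base):
    # Only content values that are paths or absolute URLs can be status-checked
    if tag.content and (tag.content.startswith('/') or tag.content.startswith('http')):
        print(tag.prop, tag.content, tag.get_content())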