+ Open graph check script

Basically this is going to be used to verify that certain Open Graph tags are actually available. We do this by attempting to fetch the intended content locally instead of using a service like opengraph.xyz (which is great but really cumbersome). At some point we might want to improve this by adding more features, but I'll cross that bridge when I get there.
2021-09-24 20:55:24 -07:00 · 2021-09-24 20:55:24 -07:00 · 02a0e26dd1
commit 02a0e26dd1
parent 6ce00bae32
1 changed files with 98 additions and 0 deletions
--- a/scripts/check-og-tags.py
+++ b/scripts/check-og-tags.py
@@ -0,0 +1,98 @@
 import sys
 from html.parser import HTMLParser
 from urllib.parse import urljoin, urlsplit
 from requests import get
 class OpenGraphTag:
    '''
    Represents a single open graph tag
    Fields:
        @base_url the base url where the open graph is coming from,
                  e.g. "https://example.com"
        @prop mapping of the "property" attribute from the page HTML
        @content mapping of the "content" attribute from the page HTML;
                 may be None when the tag carried no such attribute
    '''
    def __init__(self, base_url: str, prop: str, content: str):
        self.base_url = base_url
        self.prop = prop
        self.content = content
    def resolve_url(self) -> str:
        '''
        Returns the absolute URL that self.content refers to
        Resolution is delegated to urllib.parse.urljoin against self.base_url:
            "/media/image.jpg"        -> base_url + "/media/image.jpg"
            "//cdn.example.org/a.jpg" -> base_url's scheme + that host and path
            "https://other.org/a.jpg" -> returned unchanged
        Raises ValueError when the tag has no content to resolve
        '''
        if not self.content:
            raise ValueError(f'{self.prop} tag has no content to fetch')
        # urljoin (rather than plain concatenation) keeps protocol-relative
        # "//host/path" values from being glued onto base_url as a path
        return urljoin(self.base_url, self.content)
    def get_content(self, timeout: float = 10.0) -> int:
        '''
        Attempts to fetch the resource listed in self.content
        Returns the HTTP status code for the requested resource
        Useful for making sure that path names in the content fields are
        correct, to avoid pushing up opengraph data that isn't linked/set up
        correctly
        Params:
            @timeout passed to requests as the request timeout in seconds, so
                     an unresponsive server can't hang the check forever
        Raises ValueError when the tag has no content to fetch
        Example:
            Intention image content: "/media/thumbnail/image.jpg"
                GET'ing this results in 200 and the image
            Reality image content:   "/image.jpg"
                GET'ing this results in 404 and no image
        '''
        response = get(self.resolve_url(), timeout=timeout)
        return response.status_code
 def fetch_url(url: str, timeout: float = 10.0) -> tuple[str, str]:
    '''
    Performs a single GET request to fetch the requested URL
    Params:
        @url the page to download
        @timeout passed to requests as the request timeout in seconds
    Returns a tuple containing: (page source, base_url)
        base_url is "<scheme>://<host>" taken from the url argument, or ''
        when the url has no scheme or host to build it from
    '''
    response = get(url, timeout=timeout)
    # response.text is the decoded body. str(response.content) would instead
    # give the repr of the raw bytes: "b'...'" with newlines, quotes and
    # non-ASCII characters turned into backslash escapes
    source = response.text
    parts = urlsplit(url)
    if parts.scheme and parts.netloc:
        return source, f'{parts.scheme}://{parts.netloc}'
    return source, ''
 class Parser(HTMLParser):
    '''
    HTML parser that collects the attributes of every <meta> tag whose
    "property" attribute starts with "og:"
    Fields:
        @leak list of {attribute name: attribute value} dicts, one per
              matching tag, in the order the tags were fed to the parser
    '''
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # handle_starttag can't return anything to whoever called feed(), so
        # matches are stashed here. The list is created per instance so that
        # separate parsers never see each other's results
        self.leak = []
    def handle_starttag(self, tag, attrs):
        '''
        Called by HTMLParser for each opening tag; records og: <meta> tags
        '''
        if tag != 'meta':
            return
        pairs = dict(attrs)
        # A missing or valueless "property" attribute comes back as None
        if (pairs.get('property') or '').startswith('og:'):
            self.leak.append(pairs)
 def get_meta_tags(source: str, base_url: str) -> list[OpenGraphTag]:
    '''
    Extracts the open graph <meta> tags from a given HTML string
    Params:
        @source the page HTML
        @base_url stored on every returned tag
    Returns one OpenGraphTag per tag collected by Parser, built from that
    tag's "property" and "content" attributes; the list is empty when the
    page has none
    '''
    parser = Parser()
    # parser.leak may be a list shared between Parser instances, so empty it
    # in place (rebinding parser.leak would leave the shared list untouched)
    # to keep results from an earlier call out of this one
    parser.leak.clear()
    parser.feed(source)
    # Flush anything HTMLParser is still buffering
    parser.close()
    tags = [
        OpenGraphTag(base_url, entry.get('property'), entry.get('content'))
        for entry in parser.leak
    ]
    parser.leak.clear()
    return tags
 if __name__ == '__main__':
    # Usage: check-og-tags.py <url>
    # Prints "<content> <HTTP status>" for every og:image tag on the page
    if len(sys.argv) < 2:
        print('No url given!', file=sys.stderr)
        # sys.exit rather than the bare exit() helper, which is only injected
        # by the site module and is not guaranteed to exist
        sys.exit(1)
    # Fetch the tags from the link provided
    tags = get_meta_tags(*fetch_url(sys.argv[1]))
    for tag in tags:
        if tag.prop != 'og:image':
            continue
        if not tag.content:
            # Nothing to fetch; report it instead of crashing the whole run
            print('og:image tag has no content attribute', file=sys.stderr)
            continue
        print(tag.content, tag.get_content())