'''
Checks the Open Graph image tags of a web page (scripts/check-og-tags.py)

Fetches the page given on the command line, collects its <meta> tags whose
"property" attribute starts with "og:", and for every "og:image" tag prints
the tag's content followed by the HTTP status code returned when requesting it.

Usage:
    python scripts/check-og-tags.py <url>
'''
import sys
from html.parser import HTMLParser
from typing import Optional
from urllib.parse import urljoin, urlsplit

# Seconds to wait on a server before giving up; requests waits forever when no
# timeout is passed, which would hang the script on a stalled host.
REQUEST_TIMEOUT = 10


def _http_get(url: str):
    '''
    Performs a GET request using the shared timeout
    Returns the requests response object
    '''
    # Imported here rather than at module level so the HTML parsing helpers
    # stay importable on machines without the third-party package installed.
    from requests import get

    return get(url, timeout=REQUEST_TIMEOUT)


def _base_url(url: str) -> str:
    '''
    Returns "scheme://host" for the given url
    Returns an empty string when the url has no scheme or no host
    '''
    parts = urlsplit(url)
    if not (parts.scheme and parts.netloc):
        return ''
    return f'{parts.scheme}://{parts.netloc}'


class OpenGraphTag:
    '''
    Represents a single open graph tag
    Fields:
        @base_url the base url where the open graph is coming from
        @prop mapping of the "property" attribute from the page HTML
        @content mapping of the "content" attribute from the page HTML
                 (None when the tag carries no content attribute)
    '''
    def __init__(self, base_url: str, prop: Optional[str], content: Optional[str]):
        self.base_url = base_url
        self.prop = prop
        self.content = content

    def __repr__(self) -> str:
        return (
            f'{type(self).__name__}(base_url={self.base_url!r}, '
            f'prop={self.prop!r}, content={self.content!r})'
        )

    def _resolved_url(self) -> str:
        '''
        Returns the absolute url that self.content points at
        Raises ValueError when the tag has no content to resolve
        '''
        if not self.content:
            raise ValueError(f'{self.prop} tag has no content to fetch')
        # urljoin leaves absolute urls untouched, resolves "/path" and "path"
        # against the base url, and gives protocol-relative "//host/path"
        # values the base url's scheme instead of gluing them onto the host.
        return urljoin(self.base_url, self.content)

    def get_content(self) -> int:
        '''
        Attempts to fetch the resource listed in self.content
        Returns the HTTP status code for the requested resource
        Raises ValueError when the tag has no content attribute
        Useful for making sure that path names in the content fields are
        correct, to avoid pushing up opengraph data that isn't linked/set up
        correctly
        Example:
            Intention image content: "/media/thumbnail/image.jpg"
                GET'ing this results in 200 and the image
            Reality image content: "/image.jpg"
                GET'ing this results in 404 and no image
        '''
        return _http_get(self._resolved_url()).status_code


def fetch_url(url: str) -> tuple[str, str]:
    '''
    Performs a single GET request to fetch the request URL
    Returns a tuple containing: (page source, base_url)
    base_url is "scheme://host", or an empty string if url has no scheme/host
    '''
    response = _http_get(url)
    # response.text is the decoded body; str(response.content) would be the
    # repr of a bytes object ("b'...'") with newlines and non-ASCII escaped.
    return response.text, _base_url(url)


class Parser(HTMLParser):
    '''
    HTML parser that collects the attributes of every open graph <meta> tag
    Fields:
        @og_tags list of attribute dicts, one per <meta> tag whose "property"
                 starts with "og:", in document order
    '''
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Per-instance state: a class-level list would be shared by every
        # parser and keep tags from previously parsed pages.
        self.og_tags = []

    def handle_starttag(self, tag, attrs):
        if tag != 'meta':
            return

        pairs = dict(attrs)
        # An attribute written without a value is reported as None
        if (pairs.get('property') or '').startswith('og:'):
            self.og_tags.append(pairs)


def get_meta_tags(source: str, base_url: str) -> list[OpenGraphTag]:
    '''
    Extracts the open graph <meta> tags from a given HTML string
    Returns one OpenGraphTag per tag found, in document order
    Returns an empty list if no open graph tag is found
    '''
    parser = Parser()
    parser.feed(source)
    parser.close()

    return [
        OpenGraphTag(base_url, entry.get('property'), entry.get('content'))
        for entry in parser.og_tags
    ]


def main(argv: list[str]) -> int:
    '''
    Prints the content and HTTP status code of every og:image tag on the page
    @argv the command line, argv[1] being the url of the page to check
    Returns the process exit code: 1 when no url was given, 0 otherwise
    '''
    if len(argv) < 2:
        print('No url given!', file=sys.stderr)
        return 1

    # Fetch the tags from the link provided
    tags = get_meta_tags(*fetch_url(argv[1]))
    for tag in tags:
        if tag.prop != 'og:image':
            continue
        if not tag.content:
            print('og:image tag has no content attribute', file=sys.stderr)
            continue
        print(tag.content, tag.get_content())
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))