+ Open graph check script

Basically this is going to be used to verify that certain Open Graph tags actually point at available content.
We do this by attempting to fetch the intended content locally
instead of relying on sites like opengraph.xyz (which is great but really cumbersome).
! At some point we might want to improve this by adding more features but
! I'll cross that bridge when I get there
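Expected usage is a single URL argument; for each og:image tag found, the script prints the tag's content and the HTTP status returned when fetching it (the URL below is only a placeholder):

    python scripts/check-og-tags.py https://example.com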
shockrah 2021-09-24 20:55:24 -07:00
parent 6ce00bae32
commit 02a0e26dd1

scripts/check-og-tags.py (new file, 98 lines)

@@ -0,0 +1,98 @@
import sys
from html.parser import HTMLParser

from requests import get


class OpenGraphTag:
    '''
    Represents a single Open Graph tag
    Fields:
        @base_url the base url where the open graph tag comes from
        @prop     mapping of the "property" attribute from the page HTML
        @content  mapping of the "content" attribute from the page HTML
    '''
    def __init__(self, base_url: str, prop: str, content: str):
        self.base_url = base_url
        self.prop = prop
        self.content = content

    def get_content(self) -> int:
        '''
        Attempts to fetch the resource listed in self.content
        Returns the HTTP status code for the requested resource
        Useful for making sure that path names in the content fields are correct,
        to avoid pushing up Open Graph data that isn't linked/set up correctly
        Example:
            Intended image content: "/media/thumbnail/image.jpg"
                GET'ing this results in 200 and the image
            Actual image content: "/image.jpg"
                GET'ing this results in 404 and no image
        '''
        url = self.content
        # Relative paths are resolved against the page's base URL
        if self.content.startswith('/'):
            url = self.base_url + self.content
        response = get(url)
        return response.status_code


def fetch_url(url: str) -> tuple[str, str]:
    '''
    Performs a single GET request to fetch the requested URL
    Returns a tuple containing: (page source, base_url)
    '''
    response = get(url)
    # Use the decoded text; str(response.content) would yield a "b'...'" repr
    # and mangle the quoting inside the markup
    source = response.text
    parts = url.split('/')  # [ scheme, '', domain, ...]
    try:
        base_url = parts[0] + '//' + parts[2]
        return source, base_url
    except IndexError:
        return source, ''


class Parser(HTMLParser):
    # This is a stupid hack to leak data from handle_starttag but fuck it
    leak = []

    def handle_starttag(self, tag, attrs):
        if tag != 'meta':
            return
        pairs = {}
        for attr in attrs:
            pairs[attr[0]] = attr[1]
        if str(pairs.get('property')).startswith('og:'):
            Parser.leak.append(pairs)


def get_meta_tags(source: str, base_url: str) -> list[OpenGraphTag]:
    '''
    Extracts the Open Graph <meta> tags from a given HTML string
    Returns a (possibly empty) list of OpenGraphTag objects
    '''
    parser = Parser()
    parser.feed(source)
    ret = []
    for entry in parser.leak:
        ret.append(OpenGraphTag(base_url, entry.get('property'), entry.get('content')))
    # Reset the class-level list so repeated calls don't accumulate old tags
    Parser.leak = []
    return ret


if __name__ == '__main__':
    if len(sys.argv) == 1:
        print('No url given!', file=sys.stderr)
        sys.exit(1)
    else:
        # Fetch the tags from the link provided
        tags = get_meta_tags(*fetch_url(sys.argv[1]))
        for tag in tags:
            if tag.prop == 'og:image':
                print(tag.content, tag.get_content())
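
A minimal sketch of programmatic reuse, assuming the file lives at scripts/check-og-tags.py as committed; the dashes in the name rule out a plain import, so importlib loads it, and the target URL is only a placeholder:

import importlib.util

# Load scripts/check-og-tags.py as a module; exec_module will not trigger the
# __main__ block because __name__ is set to the name chosen here
spec = importlib.util.spec_from_file_location('check_og_tags', 'scripts/check-og-tags.py')
og = importlib.util.module_from_spec(spec)
spec.loader.exec_module(og)

source, base = og.fetch_url('https://example.com')  # placeholder URL
for tag in og.get_meta_tags(source, base):
    # Only content values that are paths or absolute URLs can be status-checked
    if tag.content and (tag.content.startswith('/') or tag.content.startswith('http')):
        print(tag.prop, tag.content, tag.get_content())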