This example is a bit more complex because the way the page source has to be parsed depends on what the page actually contains.
The basic idea of this script is to fetch the content of an Instagram account page the same way a web browser does.
For my own account, festila.george.catalin, I ran into a parsing error; I guess the cause is the dots in the account name:
scripts_content = json.loads(scripts[0].text.strip())
IndexError: list index out of range
In this case, comment out that line of code (the json.loads call on scripts[0] shown above) and the script will work.
For the other accounts I've tried, it works very well with the default script.
This is the script I used:
import requests
from bs4 import BeautifulSoup
import json
import re
from pprint import pprint
# Fetch the public profile page of an Instagram account the same way a web
# browser would, then print a few pieces of information found in that page.

instagram_url = 'https://instagram.com'
headers = {'User-Agent': 'Mozilla/5.0'}

# Seconds to wait for the server before giving up instead of hanging forever.
REQUEST_TIMEOUT = 10

# The output file is opened as UTF-8 because the page can contain characters
# (for example '\U0001f48b') that the 'charmap' codec cannot encode, which
# otherwise ends in a UnicodeEncodeError when writing.
OUTPUT_FILE = "_shared_data.txt"

# Raw strings keep the regex escapes (\. \- \s) away from Python's own
# string-escape processing; the patterns themselves are unchanged.
SHARED_DATA_RE = re.compile(r'window\._sharedData')
# Matches the value of every "language_code":"..." pair, e.g. en
LANGUAGE_CODE_RE = re.compile(r'(?<="language_code":")[a-zA-Z_\- ]+(?=")')
# Matches the value of every "display_url":"..." pair (the image URLs).
DISPLAY_URL_RE = re.compile(r'(?<="display_url":")[^\s"]+(?=")')


def extract_shared_data(script_text):
    """Return the text assigned to ``window._sharedData`` in *script_text*.

    Everything after the first ``=`` is kept, and surrounding spaces and
    semicolons are stripped. An empty string is returned when *script_text*
    contains no ``=``.
    """
    return script_text.partition('=')[-1].strip(' ;')


def parse_profile_description(content):
    """Split the ``og:description`` text into the pieces the script reports.

    Returns a ``(words, posts, user)`` tuple where *words* is the
    whitespace-split text, *posts* is the fifth word and *user* is the last
    three words joined by spaces. Returns ``None`` when *content* is missing
    or has fewer than five words, so the caller can report it instead of
    failing with an IndexError.
    """
    words = (content or '').split()
    if len(words) < 5:
        return None
    posts = words[4]
    user = ' '.join(words[-3:])
    return words, posts, user


def main():
    """Ask for an account name, download its profile page and print the results."""
    # example user instagram profile_url = sherwoodseries
    profile_url = input("name of the instagram user: ").strip()

    # The context manager closes the file even when a later step raises.
    with open(OUTPUT_FILE, "w", encoding='utf-8') as file1:
        try:
            response = requests.get(
                f"{instagram_url}/{profile_url}",
                headers=headers,
                timeout=REQUEST_TIMEOUT,
            )
        except requests.RequestException as err:
            print(f"request failed: {err}")
            return
        if not response.ok:
            print(f"request failed with HTTP status {response.status_code}")
            return

        bs_html = BeautifulSoup(response.text, "html.parser")
        print(bs_html)

        # NOTE: the <script type="application/ld+json"> block is not parsed
        # here; indexing its first match raised "IndexError: list index out
        # of range" for at least one account.

        # `string=` is the current name of the argument formerly called `text=`.
        script_tag = bs_html.find('script', string=SHARED_DATA_RE)
        if script_tag is None:
            print('window._sharedData was not found in the page')
        else:
            shared_data = extract_shared_data(script_tag.string)
            # get item from shared data, see "language_code":"en"
            print(LANGUAGE_CODE_RE.findall(shared_data))
            # get url image from shared data
            print(DISPLAY_URL_RE.findall(shared_data))
            # load like a json
            try:
                result_json = json.loads(shared_data)
            except json.JSONDecodeError as err:
                print(f"shared data is not valid JSON: {err}")
            else:
                pprint(result_json)
            # Example: write the shared data to the output file instead of
            # only printing it.
            # file1.write(shared_data)

        data = bs_html.find_all('meta', attrs={'property': 'og:description'})
        summary = parse_profile_description(data[0].get('content')) if data else None
        if summary is None:
            print('og:description was not found or has an unexpected format')
        else:
            bb, posts, user = summary
            print('all string: ', bb)
            print('number of posts: ', posts)
            print('name and the user: ', user)


if __name__ == "__main__":
    main()
This is part of the output for the sherwoodseries account:
...
all string: ['95', 'Followers,', '24', 'Following,', '56', 'Posts', '-', 'See', 'Instagram', 'photos', 'and', 'videos', 'from', 'Sherwood', 'Series', '(@sherwoodseries)']
number of posts: 56
name and the user: Sherwood Series (@sherwoodseries)