This is one of the ways you can parse the name from the site's inner page and then email address from the address tab. I added this function .get_email()
only because I wanted to let you know as to how you can parse content from different tabs.
import re
import requests
from bs4 import BeautifulSoup
url = "https://www.businessregistration.moc.gov.kh/cambodia-master/service/create.html?targetAppCode=cambodia-master&targetRegisterAppCode=cambodia-br-soleproprietorships&service=registerItemSearch"
result_url = "https://www.businessregistration.moc.gov.kh/cambodia-master/viewInstance/update.html?id={}"
base_url = "https://www.businessregistration.moc.gov.kh/cambodia-br-soleproprietorships/viewInstance/update.html?id={}"
def get_names(s):
s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0'
res = s.get(url)
target_url = result_url.format(res.url.split("id=")[1])
soup = BeautifulSoup(res.text,"lxml")
payload = {i['name']:i.get('value','') for i in soup.select('input[name]')}
payload['QueryString'] = 'a'
payload['SourceAppCode'] = 'cambodia-br-soleproprietorships'
payload['_CBNAME_'] = 'buttonPush'
payload['_CBHTMLFRAG_'] = 'true'
payload['_VIKEY_'] = re.findall(r"viewInstanceKey:'(.*?)',", res.text)[0].strip()
payload['_CBHTMLFRAGID_'] = re.findall(r"guid:(.*?),", res.text)[0].strip()
payload['_CBNODE_'] = re.findall(r"Callback('(.*?)','buttonPush", res.text)[-1]
payload['_CBHTMLFRAGNODEID_'] = re.findall(r"AsyncWrapper(Wd.+?)'",res.text)[0].strip()
res = s.post(target_url,data=payload)
soup = BeautifulSoup(res.text,"lxml")
payload.pop('_CBHTMLFRAGNODEID_')
payload.pop('_CBHTMLFRAG_')
payload.pop('_CBHTMLFRAGID_')
for item in soup.select("a[class*='ItemBox-resultLeft-viewMenu']"):
payload['_CBNAME_'] = 'invokeMenuCb'
payload['_CBVALUE_'] = ''
payload['_CBNODE_'] = item['id'].replace('node','')
res = s.post(target_url,data=payload)
soup = BeautifulSoup(res.text,'lxml')
address_url = base_url.format(res.url.split("id=")[1])
node_id = re.findall(r"taba(.*)_",soup.select_one("a[aria-label='Addresses']")['id'])[0]
payload['_CBNODE_'] = node_id
payload['_CBHTMLFRAGID_'] = re.findall(r"guid:(.*?),", res.text)[0].strip()
payload['_CBNAME_'] = 'tabSelect'
payload['_CBVALUE_'] = '1'
eng_name = soup.select_one(".appCompanyName + .appAttrValue").get_text()
yield from get_email(s,eng_name,address_url,payload)
def get_email(s,eng_name,url,payload):
res = s.post(url,data=payload)
soup = BeautifulSoup(res.text,'lxml')
email = soup.select_one(".EntityEmailAddresses:contains('Email') .appAttrValue").get_text()
yield eng_name,email
if __name__ == '__main__':
with requests.Session() as s:
for item in get_names(s):
print(item)
Output are like:
('AMY GEMS', '[email protected]')
('AHARATHAN LIN LIANJIN FOOD FLAVOR', '[email protected]')
('AMETHYST DIAMOND KTV', '[email protected]')
与恶龙缠斗过久,自身亦成为恶龙;凝视深渊过久,深渊将回以凝视…