Automation System for Crawling Information from Sustainalytics
nuxhrr2021. 8. 3. 01:24
1. Imports
import pandas as pd
import numpy as np
import re
import requests
import bs4
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import *
from selenium.webdriver.common.keys import Keys
import time
data = pd.read_excel(r'C:\dataset\crawling_url_total_.xlsx')
data
company
main_url
sub_url
0
Novo Nordisk A/S
https://www.novonordisk.com/
https://www.novonordisk.com/sustainable-busine...
1
DSV Panalpina A/S
https://www.dsv.com/en/about-dsv/coronavirus
https://www.dsv.com/en/about-dsv/corporate-res...
2
Vestas Wind Systems A/S
https://www.vestas.com/en/about/csr#!inclusive...
https://www.vestas.com/en/investor/financial_r...
3
Orsted A/S
https://orsted.com//en/about-us
https://orsted.com/en/sustainability
4
Genmab A/S
https://www.genmab.com/who-we-are/
https://ir.genmab.com/corporate-social-respons...
...
...
...
...
297
GrowGeneration Corp
https://ir.growgeneration.com/company-information
https://esg.censible.co/companies/GrowGenerati...
298
The Hershey Co
https://www.hersheyland.com/home/about.html
https://www.thehersheycompany.com/en_us/sustai...
299
PepsiCo Inc
https://www.pepsico.com//about/diversity-equit...
https://www.pepsico.com/sustainability/overview
300
Darling Ingredients Inc
https://www.darlingii.com//about-us
https://www.darlingii.com/csr#:~:text=Darling%...
301
Mondelez International
https://www.mondelezinternational.com//About-Us
https://www.mondelezinternational.com/Snacking...
302 rows × 3 columns
companies = data['company']
companies
0 Novo Nordisk A/S
1 DSV Panalpina A/S
2 Vestas Wind Systems A/S
3 Orsted A/S
4 Genmab A/S
...
297 GrowGeneration Corp
298 The Hershey Co
299 PepsiCo Inc
300 Darling Ingredients Inc
301 Mondelez International
Name: company, Length: 302, dtype: object
2. Company Name Preprocessing
abbs = ['A/S', 'Corp', 'Inc', 'Ltd', 'PLC',
'ADR', 'SA', 'Co', 'AG', 'SE', 'SpA',
'S.A.', 'Inc.', 'NV', 'ASA', 'Group',
'Corporation', 'Company']
company_revised = []
for i in data['company']:
splited = i.split(' ')
for j in splited:
if j in abbs:
i = re.sub(j, '', i)
i = i.rstrip()
company_revised.append(i)
company_revised
['Novo Nordisk',
'DSV Panalpina',
'Vestas Wind Systems',
'Orsted',
'Genmab',
'Carlsberg',
'Coloplast',
'Novozymes',
'A. P. Moller Maersk',
'Pandora',
'MSCI',
'Goldman Sachs FS Treasury Intms Instl',
'Microsoft',
'The Home Depot',
'NVIDIA',
'Hess',
'BlackRock',
'Morgan Stanley',
'Honeywell International',
"Lowe's Companies",
'Vale',
'BHP',
'Rio Tinto',
'Petroleo Brasileiro Petrobras',
'Fortescue Metals',
'International Paper',
'Tyson Foods',
'Nutrien',
'Archer-Daniels Midland',
'Veolia Environnement',
'Danaher',
'Ecolab',
'Ferguson',
'Geberit',
'Pentair',
'Roper Technologies',
'Waters',
'American Water Works',
'IDEX',
'Itron',
'Advanced Drainage Systems',
'Xylem',
'A.O. Smith',
'The Toro',
'Royal Bank of Canada',
'Shopify',
'The Toronto-Dominion Bank',
'Enbridge',
'Bank of Nova Scotia',
'Canadian National Railway',
'Brookfield Asset Management',
'Bank of Montreal',
'Canadian Pacific Railway',
'TC Energy',
'Investor',
'Nordea Bank Abp',
'Atlas Copco',
'Telefonaktiebolaget L M Ericsson',
'Volvo',
'KONE Oyj',
'Nokia Oyj',
'Aptiv',
'Johnson Controls International',
'Eaton',
'ABB',
'Schneider Electric',
'Enphase Energy',
'Samsung SDI',
'SolarEdge Technologies',
'Prysmian',
'Terna',
'Canadian Imperial Bank of Commerce',
'Essential Utilities',
'AstraZeneca',
'Unilever',
'HSBC Holdings',
'Diageo',
'GlaxoSmithKline',
'Rio Tinto',
'BP',
'British American Tobacco',
'Royal Dutch Shell',
'Welltower',
'Ventas',
'Healthpeak Properties',
'Omega Healthcare Investors',
'Orpea',
'Amedisys',
'LHC',
'Ensign',
'Aedifica',
'Sabra Health Care REIT',
'Alphabet',
'Visa',
'The Walt Disney',
'Salesforce.com',
'Tesla',
'Intel',
'Procter & Gamble',
'ASML Holding',
'Roche Holding',
'Nestle',
'AIA',
'Allianz',
'Wesfarmers',
'Hong Kong Exchanges and Clearing',
'ITOCHU Techno-Solutions',
'Admiral',
'MS&AD Insurance Holdings',
'Henkel',
'RELX',
'Croda International',
'Novartis',
'Scout24',
'Royal Mail',
'Investec',
'BT',
'Morrison (Wm) Supermarkets',
'Glencore',
'EVRAZ',
'Ashtead',
'Entain',
'B&M European Value Retail',
'Spectris',
'Equinor',
'DNB',
'Telenor',
'Mowi',
'Yara International',
'Norsk Hydro',
'Orkla',
'Tomra Systems',
'Adevinta',
'Gjensidige Forsikring',
'Dj U.S. Semiconductors Index Swap Societe Generale',
'Broadcom',
'Texas Instruments',
'Qualcomm',
'Wipro',
'Akzo Nobel',
'Magna International',
'Vivendi',
'Tokyo Electron',
'Nippon Telegraph & Telephone',
'Mitsubishi',
'Power of Canada',
'Commonwealth Bank of Australia',
'BHP',
'CSL',
'Westpac Banking',
'National Australia Bank',
'Australia and New Zealand Banking',
'Macquarie',
'Woolworths',
'Rio Tinto',
'LVMH Moet Hennessy Louis Vuitton',
'Toyota Motor',
'SAP',
'Berkshire Hathaway',
'JPMorgan Chase &',
'Bank of America',
'Wells Fargo &',
'Citigroup',
'Goldman Sachs',
'Sony',
"L'Oreal",
'Keyence',
'Siemens',
'Daikin Industries',
'AXA',
'CRH',
'Flutter Entertainment',
'Kerry',
'Kingspan',
'Smurfit Kappa',
'Ryanair Holdings',
'Grafton Shs',
'Icon',
'Glanbia',
'Bank of Ireland',
'Intermediate Capital',
'Smith (DS)',
'Rightmove',
'Howden Joinery',
'Electrocomponents',
'Weir',
'Pennon',
'Dechra Pharmaceuticals',
'IMI',
'Barclays',
'Emera',
'AerCap Holdings N.V.',
'Wells Fargo &',
'JPMorgan Chase & Co.',
'Societe Generale',
'Credit Suisse',
'Enbridge Incorporation',
'Lloyds Banking plc',
'Global Atlantic Finance',
'Assa Abloy',
'Hexagon',
'Sandvik',
'Evolution AB',
'Skandinaviska Enskilda Banken',
'UPM-Kymmene Oyj',
'Avery Dennison',
'Amcor Ordinary Shares',
'Weyerhaeuser',
'Mondi',
'Packaging of America',
'WestRock',
'Stora Enso Oyj',
"Land O'Lakes",
'Australia and New Zealand Banking',
'Assured Guaranty Municipal Holdings',
'Markel',
'Dominion Energy',
'The Bank of New York Mellon',
'Wells Fargo &',
'Duke Energy',
'Citigroup',
'HSBC Capital',
'TransCanada Trust',
'Bank of America',
'Sumitomo Life Insurance',
'MetLife',
'NatWest plc',
'Nordea Bank ABP',
'Credit Agricole',
'Societe Generale',
'HSBC Holdings plc',
'ING Groep N.V.',
'UBS Funding (Switzerland)',
'Standard Chartered plc',
'AT&T',
'Computershare',
'Brother Industries',
'Daiwa House Industry',
'Koninklijke Ahold Delhaize',
'Lawson',
'T&D Holdings',
'ITOCHU',
'Toyota Tsusho',
'AGL Energy',
'Telefonica',
'BNP Paribas',
'Banco Santander',
'Zurich Insurance',
'UBS',
'ING Groep',
'Prudential',
'Lloyds Banking',
'Prosus',
'Koninklijke Philips',
'Adyen',
'Koninklijke DSM',
'Wolters Kluwer',
'Heineken',
'BlackRock Cash Funds Treasury SL Agency',
'Bank of Nova Scotia',
'Avangrid',
'Target',
'Ingersoll-Rand Global Holdings Limited',
'Banco Bilbao Vizcaya Argentaria,',
'Target',
'Kellogg',
'Automatic Data Processing,',
'Sands China',
'NextEra Energy',
'Exelon',
'Public Service Enterprise',
'Sempra Energy',
'Alliant Energy',
'CMS Energy',
'Xcel Energy',
'Fortis',
'NiSource',
'Northrop Grumman',
'General Dynamics',
'Keurig Dr Pepper',
'Altria',
'Hormel Foods',
'Nucor',
'Vulcan Materials',
'Conagra Brands',
'Clorox',
'Taiwan Semiconductor Manufacturing',
'ASML Holding',
'Advanced Micro Devices',
'Micron Technology',
'Applied Materials',
'Marvell Technology',
'Analog Devices',
'KLA',
'JM Smucker',
'General Mills',
'Bunge',
'GrowGeneration',
'The Hershey',
'PepsiCo',
'Darling Ingredients',
'Mondelez International']
error = []
driver = webdriver.Chrome(r'C:\Users\esthe\chromedriver.exe')
search_tool = 'https://www.sustainalytics.com/esg-ratings/?utm_term=&utm_campaign=Leads-Search-20&utm_source=adwords&utm_medium=ppc&hsa_acc=4619360780&hsa_cam=11145778763&hsa_grp=108965194933&hsa_ad=514798435870&hsa_src=g&hsa_tgt=dsa-437115340933&hsa_kw=&hsa_mt=b&hsa_net=adwords&hsa_ver=3&gclid=CjwKCAjwjJmIBhA4EiwAQdCbxte5HszKf3XYqp6OXKKctLCPuuJMvdgh3IcVVHgKlauc2WKjO1p5YRoCl6UQAvD_BwE'
num = 0
for comp in company_revised:
try:
sustainalytics(comp)
num += 1
except NoSuchElementException:
print(comp)
error.append(comp)
continue
nunm += 1
A. P. Moller Maersk
MSCI
Goldman Sachs FS Treasury Intms Instl
Lowe's Companies
Petroleo Brasileiro Petrobras
Archer-Daniels Midland
Geberit
Advanced Drainage Systems
A.O. Smith
Bank of Montreal
Atlas Copco
Telefonaktiebolaget L M Ericsson
Amedisys
Ensign
Sabra Health Care REIT
Hong Kong Exchanges and Clearing
Novartis
Morrison (Wm) Supermarkets
Adevinta
Dj U.S. Semiconductors Index Swap Societe Generale
Power of Canada
Westpac Banking
Australia and New Zealand Banking
LVMH Moet Hennessy Louis Vuitton
Keyence
Grafton Shs
Smith (DS)
Rightmove
Enbridge Incorporation
Lloyds Banking plc
Global Atlantic Finance
Amcor Ordinary Shares
Packaging of America
Stora Enso Oyj
Land O'Lakes
Australia and New Zealand Banking
Assured Guaranty Municipal Holdings
HSBC Capital
TransCanada Trust
Sumitomo Life Insurance
NatWest plc
UBS Funding (Switzerland)
Lloyds Banking
Koninklijke DSM
BlackRock Cash Funds Treasury SL Agency
Ingersoll-Rand Global Holdings Limited
Banco Bilbao Vizcaya Argentaria,
Automatic Data Processing,
Xcel Energy
Fortis
NiSource
Keurig Dr Pepper
Marvell Technology
JM Smucker
General Mills
GrowGeneration
Darling Ingredients
num = 0
for i in error:
print(num, i)
num += 1
0 A. P. Moller Maersk
1 MSCI
2 Goldman Sachs FS Treasury Intms Instl
3 Lowe's Companies
4 Petroleo Brasileiro Petrobras
5 Archer-Daniels Midland
6 Geberit
7 Advanced Drainage Systems
8 A.O. Smith
9 Bank of Montreal
10 Atlas Copco
11 Telefonaktiebolaget L M Ericsson
12 Amedisys
13 Ensign
14 Sabra Health Care REIT
15 Hong Kong Exchanges and Clearing
16 Novartis
17 Morrison (Wm) Supermarkets
18 Adevinta
19 Dj U.S. Semiconductors Index Swap Societe Generale
20 Power of Canada
21 Westpac Banking
22 Australia and New Zealand Banking
23 LVMH Moet Hennessy Louis Vuitton
24 Keyence
25 Grafton Shs
26 Smith (DS)
27 Rightmove
28 Enbridge Incorporation
29 Lloyds Banking plc
30 Global Atlantic Finance
31 Amcor Ordinary Shares
32 Packaging of America
33 Stora Enso Oyj
34 Land O'Lakes
35 Australia and New Zealand Banking
36 Assured Guaranty Municipal Holdings
37 HSBC Capital
38 TransCanada Trust
39 Sumitomo Life Insurance
40 NatWest plc
41 UBS Funding (Switzerland)
42 Lloyds Banking
43 Koninklijke DSM
44 BlackRock Cash Funds Treasury SL Agency
45 Ingersoll-Rand Global Holdings Limited
46 Banco Bilbao Vizcaya Argentaria,
47 Automatic Data Processing,
48 Xcel Energy
49 Fortis
50 NiSource
51 Keurig Dr Pepper
52 Marvell Technology
53 JM Smucker
54 General Mills
55 GrowGeneration
56 Darling Ingredients
error2 = []
driver = webdriver.Chrome(r'C:\Users\esthe\chromedriver.exe')
search_tool = 'https://www.sustainalytics.com/esg-ratings/?utm_term=&utm_campaign=Leads-Search-20&utm_source=adwords&utm_medium=ppc&hsa_acc=4619360780&hsa_cam=11145778763&hsa_grp=108965194933&hsa_ad=514798435870&hsa_src=g&hsa_tgt=dsa-437115340933&hsa_kw=&hsa_mt=b&hsa_net=adwords&hsa_ver=3&gclid=CjwKCAjwjJmIBhA4EiwAQdCbxte5HszKf3XYqp6OXKKctLCPuuJMvdgh3IcVVHgKlauc2WKjO1p5YRoCl6UQAvD_BwE'
for comp in error:
try:
sustainalytics(comp)
except NoSuchElementException:
print(comp)
error2.append(comp)
continue
MSCI
Advanced Drainage Systems
Amedisys
Ensign
Sabra Health Care REIT
Global Atlantic Finance
Land O'Lakes
TransCanada Trust
Sumitomo Life Insurance
Automatic Data Processing,
GrowGeneration
Darling Ingredients