// Google Scholar Author Crawler
//
// Crawl author information from Google Scholar and export to a TXT file.
// ==UserScript==
// @name         Google Scholar Author Crawler
// @namespace    https://example.com/
// @version      1.0.1
// @description  Crawl author information from Google Scholar and export to TXT file
// @match        https://scholar.google.com/*
// @grant        GM_xmlhttpRequest
// @grant        unsafeWindow
// @license      MIT
// ==/UserScript==
(async () => {
const button = document.createElement("button");
button.textContent = "Crawl Authors";
button.style.position = "fixed";
button.style.top = "20px";
button.style.right = "100px";
button.addEventListener("click", crawlAuthors);
document.body.appendChild(button);
const delay = (ms) => new Promise(resolve => setTimeout(resolve, ms));
async function crawlAuthors() {
const urlParams = new URLSearchParams(window.location.search);
const label = urlParams.get("mauthors").match(/label:([^&]+)/)[1];
const query = `label:${label}`;
const authors = await crawlPage(window.location.href, []);
const rangeElement = document.querySelector("span.gs_nph.gsc_pgn_ppn");
const rangeText = rangeElement.textContent.trim();
const startRange = parseInt(rangeText.split(" ")[0]);
const endRange = parseInt(rangeText.split(" ")[2]);
const fileName = `${query}_${startRange}-${endRange}.txt`;
exportToTxt(authors, fileName);
}
async function crawlPage(url, authors) {
return new Promise(async (resolve) => {
GM_xmlhttpRequest({
method: "GET",
url: url,
onload: async (response) => {
const parser = new DOMParser();
const doc = parser.parseFromString(response.responseText, "text/html");
for (const author of doc.querySelectorAll("div.gs_ai")) {
const name = author.querySelector("h3.gs_ai_name > a").textContent.trim();
const affiliation = author.querySelector("div.gs_ai_aff").textContent.trim();
const fieldsLinks = author.querySelectorAll("div.gs_ai_int > a.gs_ai_one_int");
const separator = ",";
const fields = Array.from(fieldsLinks).map(link => link.textContent.trim().toLowerCase().replace(/[^a-zA-Z0-9]/g, ' ')).join(separator);
const citation_count_element = author.querySelector("div.gs_ai_cby");
const citation_count_match = citation_count_element.textContent.match(/\d+/);
const citation_count = citation_count_match ? citation_count_match[0] : "";
const profileLink = "https://scholar.google.com" + author.querySelector("h3.gs_ai_name > a").getAttribute("href");
const hIndex = await fetchAuthorHIndex(profileLink);
authors.push({
"Name": name,
"Affiliation": affiliation,
"Fields": fields,
"Citation Count": citation_count,
"Profile Link": profileLink,
"H-index": hIndex,
});
}
resolve(authors);
await delay(1000);
}
});
});
}
async function fetchAuthorHIndex(profileLink) {
return new Promise((resolve) => {
GM_xmlhttpRequest({
method: "GET",
url: profileLink,
onload: (response) => {
const parser = new DOMParser();
const doc = parser.parseFromString(response.responseText, "text/html");
const hIndex = doc.querySelectorAll("td.gsc_rsb_std")[2].textContent.trim();
resolve(hIndex);
},
});
});
}
function exportToTxt(authors, fileName) {
const lines = [
"Name\tAffiliation\tFields\tCitation Count\tProfile Link\tH-index",
...authors.map(
(a) =>
`${a.Name}\t${a.Affiliation}\t${a.Fields}\t${a["Citation Count"]}\t${a["Profile Link"]}\t${a["H-index"]}`
),
];
const content = lines.join("\n");
const blob = new Blob([content], { type: "text/plain;charset=utf-8" });
const url = URL.createObjectURL(blob);
const a = document.createElement("a");
a.href = url;
a.download = fileName;
a.click();
URL.revokeObjectURL(url);
}
})();