-
-
Notifications
You must be signed in to change notification settings - Fork 45
Open
Description
Hi, Taniguchi Masaya sama,
As mentioned in my X message, I am having difficulty crawling using the cheerio gs library. The script continues to time out, despite my best work. Please see attached my code below. If there is any support you can lend, it would be greatly appreciated.
function fetchAndExtractBusinessInfo() {
var sheet = SpreadsheetApp.getActiveSpreadsheet().getActiveSheet();
var url = sheet.getRange('B7').getValue();
var maxPages = 10; // Adjust based on the expected number of pages
var batchSize = 3; // Number of pages to process per batch
var retries = 3; // Number of retries for each fetch
var startRow = 7; // Starting row for data
var startTime = new Date().getTime();
var maxExecutionTime = 5 * 60 * 1000; // 5 minutes
// Initialize row counters for each column
var nameRow = startRow;
var numberRow = startRow;
var emailRow = startRow;
var websiteRow = startRow;
for (var page = 1; page <= maxPages; page++) {
if (new Date().getTime() - startTime > maxExecutionTime - 30000) {
// Stop execution if close to the 5-minute limit
break;
}
var paginatedUrl = url + "&page=" + page;
var success = false;
for (var attempt = 0; attempt < retries; attempt++) {
try {
// Fetch HTML content from the URL with a shorter timeout
var response = UrlFetchApp.fetch(paginatedUrl, { muteHttpExceptions: true, timeout: 20000 });
var html = response.getContentText();
if (response.getResponseCode() !== 200) {
throw new Error('Failed to fetch URL: ' + paginatedUrl + ' (status: ' + response.getResponseCode() + ')');
}
// Regular expressions to match the required data
var nameRegex = /<div class="MuiTypography-root jss323 MuiTypography-h3 MuiTypography-displayBlock">([^<]*)<\/div>/g;
var numberRegex = /<span class="MuiButton-label">([^<]*)<\/span>/g;
var emailRegex = /<a[^>]*href="mailto:([^"]*)"/g;
var websiteRegex = /<a class="MuiButtonBase-root MuiButton-root MuiButton-text ButtonWebsite MuiButton-textSecondary MuiButton-fullWidth" href="([^"]*)"/g;
// Extract business names
var businessNames = extractMatches(nameRegex, html);
// Extract business numbers
var businessNumbers = extractMatches(numberRegex, html);
// Extract business email addresses
var businessEmails = extractMatches(emailRegex, html);
// Extract business websites
var businessWebsites = extractMatches(websiteRegex, html);
nameRow = writeDataToSheet(sheet, businessNames, 'C', nameRow);
numberRow = writeDataToSheet(sheet, businessNumbers, 'D', numberRow);
emailRow = writeDataToSheet(sheet, businessEmails, 'E', emailRow);
websiteRow = writeDataToSheet(sheet, businessWebsites, 'F', websiteRow);
success = true;
break;
} catch (e) {
Logger.log('Attempt ' + (attempt + 1) + ' failed: ' + e.toString());
}
}
// Pause between batches to prevent hitting execution time limits
Utilities.sleep(2000);
}
}
// Function to extract matches using a regex pattern
function extractMatches(regex, html) {
var matches = [];
var match;
while (match = regex.exec(html)) {
matches.push(match[1]);
}
return matches;
}
// Function to convert a column letter to its corresponding index
function columnLetterToIndex(column) {
return column.charCodeAt(0) - 'A'.charCodeAt(0) + 1;
}
// Function to write data to the sheet starting at the given cell and return the next available row
function writeDataToSheet(sheet, data, column, startRow) {
var columnIndex = columnLetterToIndex(column);
for (var i = 0; i < data.length; i++) {
sheet.getRange(startRow + i, columnIndex).setValue(data[i]);
}
return startRow + data.length;
}Best regards,
Samuel Shilson-Josling
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
No labels