<?php
require_once('server_config.php');
require_once('html_reader.php');
require_once('error_log.php');

// Shared database handle: db_config.php is required for its return value,
// which the helper functions below use as a PDO connection via `global $pdo`.
$pdo = require 'db_config.php';

// Page state. These globals are written by scraper() and the DB helpers and
// are rendered by the HTML template at the bottom of this file, so every one
// of them is given a defined value before any branch runs.
$category = array();
$message = "";
$m_category_sk = 0;
$sub_cat_buttons = "";
$url = "";
$hd_parent_sk = 0;
$hd_parent_url = "";

$objLog = new LogFile();

// Category URLs are looked up and fetched with exactly one trailing slash.
$with_trailing_slash = static function ($raw_url) {
    $clean = trim((string) $raw_url);
    return str_ends_with($clean, '/') ? $clean : $clean . '/';
};

if (($_SERVER['REQUEST_METHOD'] ?? '') === 'POST' && isset($_POST['btn_sub_cat'])) {
    // A related-category button was clicked; its value is the sub-category URL.
    // The value is kept raw: HTML-escaping it here would turn "&" into "&amp;"
    // and break both the database lookup and the HTTP fetch. Escaping belongs
    // at output time only.
    $url = $with_trailing_slash($_POST['btn_sub_cat']);

    // Parent context is expected to be posted along with the button;
    // fall back to "no parent" instead of reading undefined request keys.
    $hd_parent_sk = $_REQUEST["hd_parent_sk"] ?? 0;
    $hd_parent_url = $_REQUEST["hd_parent_url"] ?? "";

    // Find out whether this sub-category was already processed.
    $subcat_id = check_exists_url_in_microcategory($url);

    $objLog->set_name($url);
    // Double quotes so that \r\n is written as a line break, not literally.
    $objLog->write_to_file("*******\r\n\r\n" . $url . "\r\n\r\n*******\r\n\r\n");
    $objLog->write_to_file("\r\n\r\nSub Category Button is clicked\r\n\r\n");

    if (!is_numeric($subcat_id)) {
        $objLog->write_to_file("\r\n\r\n going to scrape related category link " . $url . "\r\n\r\n");
        scraper($url, "");
    } elseif (is_numeric($hd_parent_sk) && $hd_parent_sk > 0) {
        $objLog->write_to_file("\r\n\r\n display related category buttons " . $url . "\r\n\r\n");
        $sub_cat_buttons = get_related_categories($hd_parent_sk);
    } else {
        // get_related_categories() throws on an empty/non-numeric key, which
        // would be an uncaught fatal here; report it on the page instead.
        $message .= "<br/>Parent category id is missing; cannot list related categories.";
        $objLog->write_to_file($message);
    }
} elseif (!empty($_POST["site_url"])) {
    // A parent category URL was submitted.
    $url = $with_trailing_slash($_POST['site_url']);
    $name = $_POST['cat_name'] ?? "";

    $objLog->set_name($url);
    $objLog->write_to_file("*******\r\n" . $url . "\r\n*******\r\n\r\n");
    $objLog->write_to_file("\r\n*******\r\n\r\n Parent Category " . $name . " is submitted \r\n*******\r\n\r\n");

    if (!empty($_POST["hd_parent_sk"])) {
        $hd_parent_sk = $_POST["hd_parent_sk"];
    }
    $hd_parent_url = $url;

    $parent_id = check_exists_url_in_microcategory($url);
    if (!empty($parent_id) && is_numeric($parent_id)) {
        // Already stored: skip scraping and just list its related categories.
        $hd_parent_sk = $parent_id;
        // $message is echoed as HTML by the template, so the user-supplied
        // URL is escaped before being embedded in it.
        $safe_url = htmlspecialchars($url, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
        $message = "\r\n\r\n<br/><br/>Category URL:<h1> '" . $safe_url . "' </h1> already exists. \r\n<br/><br/> Skipping insertion.\r\n\r\n";
        $objLog->write_to_file("*******\r\n LineNo 55 - parent category already processed. Now goingto fetch related categories \r\n*******\r\n\r\n");
        $sub_cat_buttons = get_related_categories($hd_parent_sk);
    } else {
        // Not stored yet: scrape starting from the submitted URL.
        $objLog->write_to_file("*******\r\n LineNo 59 - parent category going to process \r\n*******\r\n\r\n");
        scraper($url, $name);
    }
} else {
    // Nothing to process: send the user back to the entry form.
    // SITE_URL is presumably defined by server_config.php - TODO confirm.
    header("Location: " . SITE_URL . "micro_site_scraper.php");
    // Stop here; without this the script would go on to render the page
    // below even though a redirect was sent.
    exit;
}

/**
 * Scrapes one category page and stores everything found on it.
 *
 * Steps, in order:
 *  1. fetch the page with get_html_content();
 *  2. extract H1, short description and the 'content-bottom' HTML and insert
 *     the category through insert_to_catMic();
 *  3. if a new row was created: store the power-page articles and the
 *     related-category links, run the advert scrapers for page 1, page 2 and
 *     the state page, mark the URL as scraped and rebuild the related-category
 *     buttons.
 *
 * Results are reported through the globals $message, $m_category_sk,
 * $hd_parent_sk, $hd_parent_url and $sub_cat_buttons.
 *
 * get_html_content(), str_get_html(), process_adv_html() and adv_scraper()
 * are not defined in this file; they presumably come from html_reader.php and
 * adv_scraper.php - TODO confirm.
 *
 * @param string $url  Category page URL (callers pass it with a trailing slash).
 * @param string $name Category name; a non-empty name marks a parent category,
 *                     an empty one a sub-category reached from a button.
 * @return void
 */
function scraper($url, $name)
{
    global $m_category_sk;
    global $hd_parent_sk;
    global $hd_parent_url;
    global $message;
    global $sub_cat_buttons;
    global $objLog;

    $base_url = rtrim($url, "/");
    // $url normally already ends in "/", so build from the trimmed form to
    // avoid "//category/alabama/".
    $state_url = $base_url . "/category/alabama/";
    $page2_url = "";
    $powerData = array();
    $rltdData = array();

    $htmlContent = get_html_content($url);

    // Report fetch problems on the page instead of terminating with a blank
    // response; DOMDocument::loadHTML() also rejects an empty string.
    if (!is_string($htmlContent) || trim($htmlContent) === "") {
        $message .= "<br/>No HTML content received from " . $url;
        $objLog->write_to_file($message);
        return;
    }
    if (str_contains($htmlContent, "cURL error")) {
        $message .= "cURL error: " . $htmlContent;
        $objLog->write_to_file($message);
        return;
    }

    // Simple HTML DOM instance, only handed on to process_adv_html() below.
    $html = str_get_html($htmlContent);

    $category = array();
    $category['NAME'] = $name;
    $category['CATEGORY_URL'] = $url;
    $category['PRIMARY_PARENT_SK'] = $hd_parent_sk;
    $category['INDUSTRY_DESCRIPTION'] = "";

    $dom = new DOMDocument;
    // Suppress warnings/errors caused by malformed HTML.
    libxml_use_internal_errors(true);
    $dom->loadHTML($htmlContent);
    libxml_clear_errors();

    $xpath = new DOMXPath($dom);

    // Link to the second listings page, if the page has one.
    $page2_links = $xpath->query("//h2[@class='more-listings-page-link']/a");
    if ($page2_links->length > 0) {
        $page2_href = trim($page2_links->item(0)->getAttribute('href'));
        if ($page2_href !== "") {
            // Absolute hrefs are used as they are; relative ones are joined to
            // the category URL with exactly one slash.
            $page2_url = parse_url($page2_href, PHP_URL_SCHEME)
                ? $page2_href
                : $base_url . "/" . ltrim($page2_href, "/");
        }
    }

    $category['H1DISPLAY_NAME'] = scraper_first_descendant_text($xpath, "//div[@class='main-heading']", 'h1');
    $category['SHORT_DESCRIPTION'] = scraper_first_descendant_text($xpath, "//div[@class='content-top']", 'p');

    $contentBottom = $xpath->query("//div[@class='content-bottom']");
    if ($contentBottom->length > 0) {
        $contentDiv = $contentBottom->item(0);

        // Strip the "more listings" heading and the power-page container so
        // that only the descriptive HTML is stored.
        $unwanted_queries = array(
            ".//h2[contains(@class, 'more-listings-page-link')]",
            ".//div[contains(@id, 'powerpage')]",
        );
        foreach ($unwanted_queries as $unwanted_query) {
            foreach ($xpath->query($unwanted_query, $contentDiv) as $unwanted) {
                $unwanted->parentNode->removeChild($unwanted);
            }
        }
        $category['INDUSTRY_DESCRIPTION'] = $dom->saveHTML($contentDiv);
    } else {
        $message .= "<br/> No 'content-bottom' div found.";
    }

    // Left empty here; per the original notes these are filled in later by
    // the state-search and page-2 scrapers.
    $category['STATESEARCH_DESCRIPTION'] = "";
    $category['P2_H1DISPLAY_NAME'] = "";
    $category['P2_SHORT_DESCRIPTION'] = "";

    $m_category_sk = insert_to_catMic($category);

    if ($name != "") {
        // Parent category: it becomes the parent context for the page.
        $hd_parent_sk = $m_category_sk;
        $hd_parent_url = $url;
    } else {
        // Sub-category: keep the posted parent context, falling back to the
        // current globals when the request does not carry it.
        $hd_parent_sk = $_REQUEST["hd_parent_sk"] ?? $hd_parent_sk;
        $hd_parent_url = $_REQUEST["hd_parent_url"] ?? $hd_parent_url;
    }

    if (!($m_category_sk > 0)) {
        // Nothing was inserted (duplicate URL or DB error); $message already
        // carries the reason set by insert_to_catMic().
        $objLog->write_to_file($message);
        return;
    }

    $message .= "<br/>\n\n URL: " . $url;
    $message .= "<br/>\n\n Data inserted successfully! New Category Id :" . $m_category_sk . "\n\n";

    // Power-page entries: one <article> per entry inside #powerpage-inner.
    foreach ($xpath->query("//div[@id='powerpage-inner']/article") as $article) {
        $titleNodes = $xpath->query(".//h3", $article);
        $imageNodes = $xpath->query(".//a[@class='image']/img", $article);
        $textNodes = $xpath->query(".//p", $article);
        $anchorNodes = $xpath->query(".//a[@class='image']", $article);

        $powerData[] = array(
            "M_CATEGORY_SK" => $m_category_sk,
            "M_CATEGORY_URL" => $url,
            "NAME" => $titleNodes->length > 0 ? trim($titleNodes->item(0)->nodeValue) : "N/A",
            "IMAGE" => $imageNodes->length > 0 ? $imageNodes->item(0)->getAttribute('src') : "N/A",
            "DESCRIPTION" => $textNodes->length > 0 ? trim($textNodes->item(0)->nodeValue) : "N/A",
            "URL" => $anchorNodes->length > 0 ? $anchorNodes->item(0)->getAttribute('href') : "N/A",
        );
    }
    insert_power_page($powerData);

    // Related (sub) categories: the links of the #menu-categories list.
    foreach ($xpath->query("//ul[@id='menu-categories']/li/a") as $sub_cat_link) {
        $rltdData[] = array(
            "M_CATEGORY_SK" => $m_category_sk,
            "CATEGORY_URL" => $sub_cat_link->getAttribute('href'),
            "CATEGORY_NAME" => trim($sub_cat_link->nodeValue),
        );
    }
    insert_rltd_cat($rltdData);

    // Advertisements: page 1 from the HTML already loaded, then page 2 and
    // the state page by URL.
    require_once('adv_scraper.php');
    $message .= "<br/>\n\n" . process_adv_html($html, $m_category_sk, "1");

    if ($page2_url !== "") {
        $message .= "<br/>\n\n" . adv_scraper($page2_url, $m_category_sk, "2");
    } else {
        $message .= "<br/>\n\n No page 2 link found; page 2 adverts skipped.";
    }

    $message .= "<br/>\n\n" . adv_scraper($state_url, $m_category_sk, "3");

    // Flag this URL as scraped under its parent and refresh the buttons.
    updt_scraped_status($hd_parent_sk, $url);
    if (is_numeric($hd_parent_sk) && $hd_parent_sk > 0) {
        // Guarded because get_related_categories() throws on an empty key.
        $sub_cat_buttons = get_related_categories($hd_parent_sk);
    }

    $objLog->write_to_file($message);
}

/**
 * Returns the text of the first <$tag> element inside the first node matched
 * by $containerQuery, or "" when the container or the element is missing.
 *
 * @param DOMXPath $xpath          XPath evaluator bound to the loaded page.
 * @param string   $containerQuery XPath expression selecting the container element.
 * @param string   $tag            Tag name to look for inside the container.
 * @return string
 */
function scraper_first_descendant_text(DOMXPath $xpath, string $containerQuery, string $tag): string
{
    $containers = $xpath->query($containerQuery);
    if ($containers === false || $containers->length === 0) {
        return "";
    }

    $matches = $containers->item(0)->getElementsByTagName($tag);

    return $matches->length > 0 ? $matches->item(0)->textContent : "";
}



/**
 * Sets "SCRAPED" to 'Y' on the dbo."M_CATEGORY_RLTD" row(s) that match the
 * given category key and URL.
 *
 * Failures do not propagate: the error text is appended to the global
 * $message. The accumulated $message is written to the log in either case.
 *
 * @param mixed  $m_category_sk Value matched against "M_CATEGORY_SK".
 * @param string $url           Value matched against "CATEGORY_URL".
 * @return void
 */
function updt_scraped_status($m_category_sk, $url)
{
    global $pdo;
    global $message;
    global $objLog;

    try {
        $stmt = $pdo->prepare(
            'UPDATE dbo."M_CATEGORY_RLTD" SET "SCRAPED"=:M_SCRAPED WHERE "M_CATEGORY_SK"=:M_CATEGORY_SK AND "CATEGORY_URL"=:M_URL'
        );
        $stmt->execute([
            ':M_SCRAPED' => 'Y',
            ':M_CATEGORY_SK' => $m_category_sk,
            ':M_URL' => $url,
        ]);
    } catch (Exception $e) {
        // Append, so progress messages collected earlier in the request
        // are not wiped out by the error text.
        $message .= "<br/>Error in updating category table with scraped status: " . $e->getMessage();
    }

    $objLog->write_to_file($message);
}

/**
 * Looks up a category URL in dbo."M_CATEGORY".
 *
 * Callers use is_numeric() on the result to decide whether the URL was
 * already processed, so "not found" and "lookup failed" must both yield a
 * non-numeric value.
 *
 * @param string $url Category URL to look up (exact match).
 * @return mixed The "M_CATEGORY_SK" of the first matching row, or false when
 *               no row matches or the query fails. On failure the error text
 *               is appended to the global $message.
 */
function check_exists_url_in_microcategory($url)
{
    global $pdo;
    global $message;
    global $objLog;

    // false, not 0: a numeric default would make a failed lookup look like
    // an existing category to is_numeric()-based callers.
    $catId = false;

    try {
        $checkStmt = $pdo->prepare('SELECT "M_CATEGORY_SK" FROM dbo."M_CATEGORY" WHERE "CATEGORY_URL" = :M_URL');
        $checkStmt->execute([':M_URL' => $url]);
        // fetchColumn() itself returns false when there is no row.
        $catId = $checkStmt->fetchColumn();
    } catch (PDOException $e) {
        $catId = false;
        // Append so that earlier messages of this request are preserved.
        $message .= "<br/>Error: " . $e->getMessage();
    }

    $objLog->write_to_file($message);

    return $catId;
}

/**
 * Inserts a scraped category into dbo."M_CATEGORY" unless a row with the same
 * "CATEGORY_URL" already exists.
 *
 * Before inserting, the URL's host (with a "www." prefix added when missing)
 * is looked up in dbo."IQS_CATEGORY" to obtain the "CATEGORY_SK" stored with
 * the new row; 0 is stored when no such row exists.
 *
 * Database errors do not propagate: the error text is appended to the global
 * $message and 0 is returned.
 *
 * @param array $catInfo Scraped values. Keys read here: NAME, CATEGORY_URL,
 *                       H1DISPLAY_NAME, SHORT_DESCRIPTION,
 *                       STATESEARCH_DESCRIPTION, INDUSTRY_DESCRIPTION,
 *                       P2_H1DISPLAY_NAME, P2_SHORT_DESCRIPTION.
 * @return mixed The new "M_CATEGORY_SK", or 0 when the URL already exists or
 *               the insert failed.
 */
function insert_to_catMic($catInfo)
{
    global $pdo;
    global $message;
    global $objLog;

    $new_cat_id = 0;
    $fkey = 0;

    try {
        // Skip URLs that are already stored.
        $checkStmt = $pdo->prepare('SELECT COUNT(*) FROM dbo."M_CATEGORY" WHERE "CATEGORY_URL" = :M_CATEGORY_URL');
        $checkStmt->execute([':M_CATEGORY_URL' => $catInfo['CATEGORY_URL']]);

        if ($checkStmt->fetchColumn()) {
            $message .= "\r\n\r\nCategory '" . $catInfo['CATEGORY_URL'] . "' already exists. Skipping insert.\r\n\r\n";
        } else {
            // parse_url() yields null/false when the URL has no host part.
            $host = parse_url($catInfo['CATEGORY_URL'], PHP_URL_HOST);
            $host = is_string($host) ? $host : "";

            // Ensure the "www." prefix; a prefix test, so that "www."
            // appearing elsewhere in the host does not count.
            if ($host !== "" && !str_starts_with($host, "www.")) {
                $host = "www." . $host;
            }

            try {
                $lookup = $pdo->prepare('SELECT "CATEGORY_SK" FROM dbo."IQS_CATEGORY" WHERE "CATEGORY_URL" = :M_URL');
                $lookup->execute([':M_URL' => $host]);

                // When several rows match, the last one wins.
                // No output here: this runs before the page template, so
                // printing the key would corrupt the rendered page.
                foreach ($lookup->fetchAll(PDO::FETCH_ASSOC) as $row) {
                    $fkey = $row['CATEGORY_SK'];
                }
            } catch (PDOException $e) {
                $objLog->error("Database error: " . $e->getMessage());
                throw $e; // handled by the outer catch below
            }

            $sql = 'INSERT INTO dbo."M_CATEGORY" ("NAME", "CATEGORY_URL","CATEGORY_SK", "DISPLAY_NAME", "H1DISPLAY_NAME","PRIMARY_PARENT_SK", "SHORT_DESCRIPTION",  "STATESEARCH_DESCRIPTION", "INDUSTRY_DESCRIPTION","P2_H1DISPLAY_NAME", "P2_SHORT_DESCRIPTION", "USER_CREATED","DATE_CREATED")
                VALUES (:M_NAME, :M_CATEGORY_URL, :CATEGORY_SK,:M_DISPLAY_NAME, :M_H1DISPLAY_NAME, :M_PRIMARY_PARENT_SK, :M_SHORT_DESCRIPTION, :M_STATESEARCH_DESCRIPTION, :M_INDUSTRY_DESCRIPTION,:M_P2_H1DISPLAY_NAME, :M_P2_SHORT_DESCRIPTION, :M_USER_CREATED,:M_DATE_CREATED ) RETURNING "M_CATEGORY_SK"';
            $stmt = $pdo->prepare($sql);
            $stmt->execute([
                ':M_NAME' => $catInfo['NAME'],
                ':M_CATEGORY_URL' => $catInfo['CATEGORY_URL'],
                ':CATEGORY_SK' => $fkey,
                // NOTE(review): display name is a fixed string for every
                // category - confirm this is intended.
                ':M_DISPLAY_NAME' => 'Leading Load Cell Manufacturers',
                ':M_H1DISPLAY_NAME' => $catInfo['H1DISPLAY_NAME'],
                // NOTE(review): always stored as '0'; scraper() passes
                // $catInfo['PRIMARY_PARENT_SK'], which is not used here -
                // confirm whether it should be.
                ':M_PRIMARY_PARENT_SK' => '0',
                ':M_SHORT_DESCRIPTION' => $catInfo['SHORT_DESCRIPTION'],
                ':M_STATESEARCH_DESCRIPTION' => $catInfo['STATESEARCH_DESCRIPTION'],
                ':M_INDUSTRY_DESCRIPTION' => $catInfo['INDUSTRY_DESCRIPTION'],
                ':M_P2_H1DISPLAY_NAME' => $catInfo['P2_H1DISPLAY_NAME'],
                ':M_P2_SHORT_DESCRIPTION' => $catInfo['P2_SHORT_DESCRIPTION'],
                ':M_USER_CREATED' => 'Sumi',
                ':M_DATE_CREATED' => date("Y-m-d"),
            ]);

            // The generated key comes back through the RETURNING clause.
            $new_cat_id = $stmt->fetchColumn();
        }
    } catch (PDOException $e) {
        // Append so that earlier messages of this request are preserved.
        $message .= "<br/>Error: " . $e->getMessage();
    }

    $objLog->write_to_file($message);

    return $new_cat_id;
}

/**
 * Inserts the scraped power-page entries into dbo."M_POWER_PAGE", one row per
 * entry, reusing a single prepared statement.
 *
 * Database errors do not propagate: the error text is appended to the global
 * $message. Rows inserted before a failure stay inserted (no transaction is
 * opened here).
 *
 * @param array|null $powerData List of entries with the keys M_CATEGORY_SK,
 *                              M_CATEGORY_URL, NAME, IMAGE, DESCRIPTION, URL.
 *                              A non-iterable value is treated as empty.
 * @return void
 */
function insert_power_page($powerData)
{
    global $pdo;
    global $message;
    global $objLog;

    // Callers may pass null when the page had no power-page articles;
    // iterating that directly would raise a warning.
    $entries = is_iterable($powerData) ? $powerData : [];

    try {
        $stmt = $pdo->prepare(
            'INSERT INTO dbo."M_POWER_PAGE" ("M_CATEGORY_SK", "M_CATEGORY_URL", "NAME","IMAGE", "DESCRIPTION", "URL")
            VALUES (:M_CATEGORY_SK, :M_CATEGORY_URL ,  :M_NAME, :M_IMAGE, :M_DESCRIPTION, :M_URL )'
        );

        foreach ($entries as $entry) {
            $stmt->execute([
                ':M_CATEGORY_SK' => $entry['M_CATEGORY_SK'],
                ':M_CATEGORY_URL' => $entry['M_CATEGORY_URL'],
                ':M_NAME' => $entry['NAME'],
                ':M_IMAGE' => $entry['IMAGE'],
                ':M_DESCRIPTION' => $entry['DESCRIPTION'],
                ':M_URL' => $entry['URL'],
            ]);
        }
        $message .= " Power Data inserted successfully!";
    } catch (PDOException $e) {
        // Append so that earlier messages of this request are preserved.
        $message .= "<br/>Error: " . $e->getMessage();
    }

    $objLog->write_to_file($message);
}

/**
 * Inserts the related-category links found on a page into
 * dbo."M_CATEGORY_RLTD", one row per link, reusing a single prepared
 * statement.
 *
 * Database errors do not propagate: the error text is appended to the global
 * $message. Rows inserted before a failure stay inserted (no transaction is
 * opened here).
 *
 * @param array|null $rltdData List of entries with the keys M_CATEGORY_SK,
 *                             CATEGORY_URL, CATEGORY_NAME. A non-iterable
 *                             value is treated as empty.
 * @return void
 */
function insert_rltd_cat($rltdData)
{
    global $pdo;
    global $message;
    global $objLog;

    // Callers may pass null when the page had no category menu;
    // iterating that directly would raise a warning.
    $links = is_iterable($rltdData) ? $rltdData : [];

    try {
        $stmt = $pdo->prepare(
            'INSERT INTO dbo."M_CATEGORY_RLTD" ("M_CATEGORY_SK", "CATEGORY_URL", "CATEGORY_NAME") VALUES (:M_CATEGORY_SK, :M_CATEGORY_URL ,  :M_NAME )'
        );

        foreach ($links as $link) {
            $stmt->execute([
                ':M_CATEGORY_SK' => $link['M_CATEGORY_SK'],
                ':M_CATEGORY_URL' => $link['CATEGORY_URL'],
                ':M_NAME' => $link['CATEGORY_NAME'],
            ]);
        }
        $message .= " Related Categories inserted successfully!";
    } catch (PDOException $e) {
        // Append so that earlier messages of this request are preserved.
        $message .= "<br/>Error: " . $e->getMessage();
    }

    $objLog->write_to_file($message);
}

/**
 * Builds the HTML for the related-category buttons of one parent category.
 *
 * Reads the rows of dbo."M_CATEGORY_RLTD" whose "M_CATEGORY_SK" equals
 * $hd_parent_sk and renders one submit button per row, named 'btn_sub_cat'
 * with the category URL as its value. A button is clickable only while the
 * row's "SCRAPED" value is 'N'; any other value renders it disabled.
 *
 * @param mixed $hd_parent_sk Parent category key; must be non-empty and numeric.
 * @return string Button markup, or "" when the parent has no related
 *                categories (a note is then appended to the global $message).
 * @throws InvalidArgumentException When $hd_parent_sk is empty or not numeric.
 * @throws PDOException             When the query fails (logged, then re-thrown).
 */
function get_related_categories($hd_parent_sk)
{
    global $pdo;
    global $objLog;
    global $message;

    if (empty($hd_parent_sk) || !is_numeric($hd_parent_sk)) {
        throw new InvalidArgumentException("Invalid 'hd_parent_sk' parameter: Must be a non-empty numeric value.");
    }

    try {
        $stmt = $pdo->prepare('SELECT "CATEGORY_NAME", "CATEGORY_URL", "SCRAPED" FROM dbo."M_CATEGORY_RLTD" WHERE "M_CATEGORY_SK" = :M_SK');
        $stmt->execute([':M_SK' => $hd_parent_sk]);
        $categories = $stmt->fetchAll(PDO::FETCH_ASSOC);
    } catch (PDOException $e) {
        $objLog->error("Database error: " . $e->getMessage());
        throw $e; // Re-throw to be handled by the caller
    }

    if (empty($categories)) {
        // Append so that earlier messages of this request are preserved.
        $message .= "<br/>No related categories found.";
        $objLog->write_to_file($message);
        return "";
    }

    // The attribute values below are single-quoted, so single quotes must be
    // escaped as well; ENT_QUOTES is only the default from PHP 8.1 on.
    $escape_flags = ENT_QUOTES | ENT_SUBSTITUTE;

    $html = "";
    foreach ($categories as $category) {
        $name = htmlspecialchars((string) $category['CATEGORY_NAME'], $escape_flags, 'UTF-8');
        $url = htmlspecialchars((string) $category['CATEGORY_URL'], $escape_flags, 'UTF-8');

        // Only rows explicitly flagged 'N' stay clickable.
        // NOTE(review): rows whose SCRAPED is NULL therefore render disabled -
        // confirm the column defaults to 'N'.
        $disabled = ((string) $category['SCRAPED'] === 'N') ? "" : " disabled";

        $html .= "<div style='margin:0 auto; padding:5px'>"
            . "<button type='submit' name='btn_sub_cat' value='{$url}'{$disabled}>{$name}</button>"
            . "</div>";
    }

    return $html;
}
// clean up resources
//$html->clear();
?>
<!DOCTYPE html>
<html lang="en">
<head>
    <title>Micro Site Scraper</title>
    <meta charset="utf-8">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.4.1/css/bootstrap.min.css">
    <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.7.1/jquery.min.js"></script>
    <script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.4.1/js/bootstrap.min.js"></script>
    <script type="text/javascript">

            /**
             * Starts the state-search scrape for one category by POSTing to
             * state_scraper.php and shows the response in #state_msg.
             *
             * The #btnUpdtSrch button is disabled while the request is
             * running and re-enabled once it finishes, whether it succeeded
             * or failed.
             *
             * @param {string} category_sk Category key, sent as 'categorySk'.
             * @param {string} page_url    Category URL, sent as 'page_url'.
             * @returns {boolean} Always false, so an inline onclick handler
             *                    suppresses the button's default action.
             */
            function ajax_updt_srch_desc(category_sk, page_url) {
                var $button = $("#btnUpdtSrch");

                alert("State Search Scraping Initiated");
                $button.prop("disabled", true);

                $.ajax({
                    type: "post",
                    url: "state_scraper.php",
                    data: {
                        'categorySk': category_sk,
                        'page_url': page_url
                    },
                    cache: false,
                    success: function (html) {
                        alert('Search State Updated for Category Sk' + category_sk);
                        $('#state_msg').html(html);
                    },
                    error: function (xhr, status, err) {
                        // Report the failure instead of leaving the page silent.
                        $('#state_msg').text('State search scraping failed: ' + status + (err ? ' (' + err + ')' : ''));
                    },
                    complete: function () {
                        // Runs after success and after error, so the button
                        // never stays disabled.
                        $button.prop("disabled", false);
                    }
                });
                return false;
            }

    </script>
    <style>
       #state_msg {
            height: 200px; /* Set a fixed height */
            overflow-y: scroll; /* Enable vertical scrolling */
            border: 1px solid #ccc; /* Optional: Add a border for better visibility */
        }
    </style>
</head>
<body>
<?php
// View values. The page state may be partly undefined (e.g. when no request
// branch assigned it), so every value is defaulted and then escaped for the
// context it is printed in.
$view_flags = ENT_QUOTES | ENT_SUBSTITUTE;
$view_url = htmlspecialchars((string) ($url ?? ''), $view_flags, 'UTF-8');
$view_parent_sk = htmlspecialchars((string) ($hd_parent_sk ?? ''), $view_flags, 'UTF-8');
$view_parent_url = htmlspecialchars((string) ($hd_parent_url ?? ''), $view_flags, 'UTF-8');
$view_has_parent = is_numeric($hd_parent_sk ?? null) && $hd_parent_sk > 0;

// Arguments for the inline onclick handler: JSON-encoded to form valid
// JavaScript string literals, then HTML-escaped for the attribute.
$view_js_flags = JSON_HEX_TAG | JSON_HEX_AMP | JSON_HEX_APOS | JSON_HEX_QUOT | JSON_INVALID_UTF8_SUBSTITUTE;
$view_js_parent_sk = htmlspecialchars((string) json_encode((string) ($hd_parent_sk ?? ''), $view_js_flags), $view_flags, 'UTF-8');
$view_js_parent_url = htmlspecialchars((string) json_encode((string) ($hd_parent_url ?? ''), $view_js_flags), $view_flags, 'UTF-8');
?>
<div class="jumbotron text-center">
    <h1>Micro Site Scraper</h1>
    <h3>Insync Tech Solutions</h3>
</div>
<div class="container-fluid" style="background-color: #00007d; color:#fff">
    <h6><?= "URL" . $view_url ?></h6>
    <?php /* $message is built server-side and deliberately contains markup, so it is printed as is. */ ?>
    <h6><?= $message ?? '' ?></h6>

</div>
<form name="frmScrape" id="frmScrape" method="POST">
    <div class="bg-success">
    <h2>Scrape Related Categories</h2>
    <?= $sub_cat_buttons ?? '' ?>
    </div>
</form>
<?php if ($view_has_parent) { ?>
    <div style='margin:0 auto; padding:5px'><button type="button" id="btnUpdtSrch" onclick='return ajax_updt_srch_desc(<?= $view_js_parent_sk ?>,<?= $view_js_parent_url ?>)'>Update State Search Description for <?= $view_url ?></button>
    </div>
<?php } ?>
<?php /* form="frmScrape" ties these fields to the form above, so the parent context is posted together with a related-category button. */ ?>
<input type="text" id="hd_parent_sk" name="hd_parent_sk" form="frmScrape" value="<?= $view_parent_sk ?>" /><br/>
<input type="text" id="hd_parent_url" name="hd_parent_url" form="frmScrape" value="<?= $view_parent_url ?>" />
<div class="container-fluid" style="background-color: #00007d; color:#fff">

    <div id="state_msg"></div>
</div>
</body>
</html>
