-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathscraper.php
88 lines (73 loc) · 2.47 KB
/
scraper.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
<?php
/*
This class is responsible for the screen scraping.
*/
include_once('simple_html_dom.php');
/**
 * Fetches pages over HTTP(S) with cURL and hands the response body to
 * str_get_html() (from simple_html_dom.php) for parsing.
 *
 * Each public method first visits the main URL, and where needed a term page,
 * before requesting the page it actually wants, so that cookies set by the
 * earlier pages are sent with the later requests.
 */
class BetterTMS_Scraper
{
    /** @var string URL of the main page; every request chain starts here. */
    private $mainURL;

    /**
     * @param string $mainURL URL of the main page to scrape.
     */
    function __construct($mainURL)
    {
        $this->mainURL = $mainURL;
    }

    /**
     * Scrapes the main page (which lists the available terms).
     *
     * @return mixed Result of str_get_html() applied to the response body.
     */
    function scrapeTerms()
    {
        return $this->fetchChain([$this->mainURL]);
    }

    /**
     * Visits the main page to pick up its cookies, then scrapes the given
     * term page.
     *
     * @param string $termLinks URL of the term page (used as a single URL
     *                          despite the plural name).
     * @return mixed Result of str_get_html() applied to the term page body.
     */
    function scrapeSubjects($termLinks)
    {
        return $this->fetchChain([$this->mainURL, $termLinks]);
    }

    /**
     * Visits the main page, then the term page, then scrapes the given
     * course-listing page, carrying cookies through the whole chain.
     *
     * TODO (from the original author): should assume $link is an array and
     * iterate over all values. Currently $link is treated as one URL.
     *
     * @param string $link URL of the page to scrape.
     * @param array  $term Term record; its 'termlink' entry is requested
     *                     before $link.
     * @return mixed Result of str_get_html() applied to the body of $link.
     */
    function scrapeCourses($link, $term)
    {
        return $this->fetchChain([$this->mainURL, $term['termlink'], $link]);
    }

    /**
     * Requests each URL in order on one cURL handle and parses the body of
     * the LAST response. Earlier responses are fetched only for the cookies
     * they set; their bodies and any failures are ignored (best effort).
     *
     * Cookies are kept in cURL's in-memory cookie engine rather than in
     * temporary cookie-jar files, so nothing is left behind on disk.
     *
     * @param array $urls Non-empty ordered list of URLs to request.
     * @return mixed Whatever str_get_html() returns for the final body. If the
     *               final transfer fails, curl_exec() yields false and that
     *               value is passed to str_get_html() unchanged.
     */
    private function fetchChain(array $urls)
    {
        $handle = curl_init();
        curl_setopt($handle, CURLOPT_RETURNTRANSFER, true);
        // NOTE(review): certificate verification is disabled, which permits
        // man-in-the-middle attacks. Kept so existing deployments keep
        // working; enable it if the target's certificate chain validates.
        curl_setopt($handle, CURLOPT_SSL_VERIFYPEER, false);
        // An empty file name turns on cookie handling without loading a file,
        // so cookies received by one request are sent with the next.
        curl_setopt($handle, CURLOPT_COOKIEFILE, '');

        $body = false;
        foreach ($urls as $url) {
            curl_setopt($handle, CURLOPT_URL, $url);
            $body = curl_exec($handle);
        }
        curl_close($handle);

        return str_get_html($body);
    }
}