Documentation ¶
Index ¶
- Variables
- func DecodeToUTF8(input []byte, charset encoding.Encoding) (output []byte, err error)
- func EncodeFromUTF8(input []byte, charset encoding.Encoding) (output []byte, err error)
- func ReplaceHTMLCharacterEntities(input string, charset encoding.Encoding) (output string)
- func TransToLocalLink(mainSite string, fullURL string, urlType int) (localLink string, err error)
- func TransToLocalPath(mainSite string, fullURL string, urlType int) (fileDir string, fileName string, err error)
- func URLFilter(fullURL string, urlType int, config *Config) (boolean bool)
- func WriteToLocalFile(baseDir string, fileDir string, fileName string, fileContent []byte) (err error)
- type Config
- type Crawler
- func (crawler *Crawler) EnqueueAsset(req *model.URLRecord)
- func (crawler *Crawler) EnqueuePage(req *model.URLRecord)
- func (crawler *Crawler) GetHTMLPage(num int)
- func (crawler *Crawler) GetStaticAsset(num int)
- func (crawler *Crawler) LoadTaskQueue() (err error)
- func (crawler *Crawler) ParseLinkingAssets(htmlDom *goquery.Document, req *model.URLRecord)
- func (crawler *Crawler) ParseLinkingPages(htmlDom *goquery.Document, req *model.URLRecord)
- func (crawler *Crawler) Start()
Constants ¶
This section is empty.
Variables ¶
var CharsetMap = map[string]encoding.Encoding{ "utf-8": unicode.UTF8, "gbk": simplifiedchinese.GBK, "gb2312": simplifiedchinese.GB18030, "gb18030": simplifiedchinese.GB18030, "big5": traditionalchinese.Big5, }
CharsetMap 字符集映射
var HTMLCharacterEntitiesMap = map[string]string{
"\u00a0": "&nbsp;",
"©": "&copy;",
"®": "&reg;",
"™": "&trade;",
"¢": "&cent;",
"£": "&pound;",
"¥": "&yen;",
"€": "&euro;",
"§": "&sect;",
}
HTMLCharacterEntitiesMap HTML 字符实体
var SpecialCharsMap = map[string]string{
"\\": "xg",
":": "mh",
"*": "xh",
"?": "wh",
"<": "xy",
">": "dy",
"|": "sx",
" ": "kg",
}
SpecialCharsMap 查询参数中的特殊字符
Functions ¶
func DecodeToUTF8 ¶
DecodeToUTF8 从输入的byte数组中按照指定的字符集解析出对应的utf8格式的内容并返回.
func EncodeFromUTF8 ¶
EncodeFromUTF8 将输入的utf-8格式的byte数组按照指定的字符集编码并返回.
func ReplaceHTMLCharacterEntities ¶
ReplaceHTMLCharacterEntities 替换页面中html实体字符, 以免写入文件时遇到不支持的字符
func TransToLocalLink ¶
TransToLocalLink ... @return: localLink 本地链接, 用于写入本地html文档中的link/script/img/a等标签的链接属性, 格式为以斜线/起始的根路径.
func TransToLocalPath ¶
func TransToLocalPath(mainSite string, fullURL string, urlType int) (fileDir string, fileName string, err error)
TransToLocalPath ... @return: 返回本地路径与文件名称, 用于写入本地文件
Types ¶
type Config ¶
type Config struct { // 单个页面中可能包含链接的最大数量 // 用LinkRatioInSinglePage*PageWorkerCount得到PageQueueSize, // 这是为了防止由于队列满而造成worker阻塞引起的死锁, // 但仍然可能由于递归抓取而写满队列 LinkRatioInSinglePage int PageWorkerCount int AssetWorkerCount int SiteDBPath string SitePath string StartPage string MainSite string UserAgent string // 爬取页面的深度, 从1开始计, 爬到第N层为止. // 1表示只抓取单页, 0表示无限制 MaxDepth int // 请求出错最大重试次数(超时也算出错) MaxRetryTimes int OutsiteAsset bool NoJs bool NoCSS bool NoImages bool NoFonts bool BlackList []string }
Config ...
type Crawler ¶
type Crawler struct { PageQueue chan *model.URLRecord // 页面任务队列 AssetQueue chan *model.URLRecord // 静态资源任务队列 Config *Config DBClient *gorm.DB DBClientMutex *sync.Mutex }
Crawler ...
func NewCrawler ¶
NewCrawler 创建Crawler对象
func (*Crawler) EnqueueAsset ¶
EnqueueAsset 静态资源任务入队列. 入队列前查询数据库记录, 如已有记录则不再接受.
func (*Crawler) EnqueuePage ¶
EnqueuePage 页面任务入队列. 入队列前查询数据库记录, 如已有记录则不再接受. 已进入队列的任务, 必定已经存在记录, 但不一定能成功下载. 由于队列长度有限, 这里可能会阻塞, 也是最可能发生死锁的地方: 每个page worker在解析页面时, 会将页面中的链接全部入队列. 如果此时队列已满, page worker就会阻塞; 当所有worker都阻塞到这里时, 程序就无法继续执行.
func (*Crawler) GetHTMLPage ¶
GetHTMLPage 工作协程, 从队列中获取任务, 请求html页面并解析
func (*Crawler) GetStaticAsset ¶
GetStaticAsset 工作协程, 从队列中获取任务, 获取静态资源并存储
func (*Crawler) LoadTaskQueue ¶
LoadTaskQueue 初始化任务队列, 读取数据库中的`PageTask`与`AssetTask`表, 将其中缓存的任务加载到任务队列中
func (*Crawler) ParseLinkingAssets ¶
ParseLinkingAssets 解析并改写页面中的静态资源链接, 包括js, css, img等元素
func (*Crawler) ParseLinkingPages ¶
ParseLinkingPages 解析并改写页面中的页面链接, 包括a, iframe等元素