Code snippets
Multi-level link hopping
- Code structure diagram
- Code:
import Base.Iterators.Stateful
import Base.Filesystem.filesize
import Base.Filesystem.walkdir
import Base.Filesystem.rm
using Logging
#=========================================
Extract web-page content by category
category: hyperlink, text, image <img>, other
class:    1,         2,    3,           4
======================================#
function get_context_from_html!(file_name,url_list,Pattern,class=1)
    #file|iostream => link_list
    #mutates url_list in place
    tmp=""
    try
        for line in eachline(file_name)
            if occursin(Pattern,line)
                m=match(Pattern,line)
                tmp=m.match
                if class==1 && length(tmp)>10
                    #push!(url_list,tmp[7:end-1])
                    #StringIndexError: raw byte indices are unsafe with variable-width (UTF-8) strings
                    push!(url_list,tmp[7:prevind(tmp,lastindex(tmp))])
                elseif class==2
                    push!(url_list,tmp) #keep as-is for now
                elseif class==3 && length(tmp) > 50
                    push!(url_list,tmp[2:prevind(tmp,lastindex(tmp))])
                end
                #println(m.match) #inspect the matched content
            end
        end
    catch e
        println(tmp," ",length(tmp)) #dump the offending match before rethrowing
        throw(e)
    end
    return url_list
end
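#= Usage sketch (hypothetical file "page.html"): extract <img> tags
(class 3) with the img pattern shown in from_link2links! below. =#
#img_tags=Set{String}()
#get_context_from_html!("page.html",img_tags,r"<img.+?>",3)
#@show length(img_tags)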
#==================================
Build multi-level link hopping across the web
1. map func: link in array -> file buffer -> links
2. reduce func: links -> set -> add_to(array)
===============================#
function from_link2links!(url)
    fileName="tmp.txt" #cache file for the downloaded HTML
    #regular-expression patterns
    #Pattern=r"<img.+?>" #matches <img ...> tags
    #Pattern=r">[^<a-zA-Z]+?[^>]<" #plain text between tags
    Pattern=r"href=\"https?.+?\"" #matches href="http(s)://..."
    new_urls=Set{String}()
    #download via curl
    try
        run(`curl -g -f --retry 0 -m 10 -o $fileName $url`) #each run overwrites the cache file
        #Base.download cannot set -m 10; an unreachable google.com blocks for ~5 min
        #patching Base.download with -y 10 did not help
    catch e
        @error "error: $e"
    end
    get_context_from_html!(fileName,new_urls,Pattern)
    return new_urls
end
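#= Usage sketch (hypothetical URL): fetch one page and collect its links. =#
#links=from_link2links!("https://example.com")
#@show length(links)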
function check_url(url)
    #further URL processing (not yet implemented)
end
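#= One possible filter for the stub above (an illustrative sketch, not the
original logic): keep only http(s) URLs whose host part is non-empty. =#
function check_url_sketch(url)
    startswith(url,"http") || return false
    parts=split(url,"/") #"https://host/..." splits into ["https:","","host",...]
    return length(parts)>=3 && !isempty(parts[3])
end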
function build_htmlSet(dir_name="htmlfile/")
    result_file="link_palace.txt" #where the links are stored
    URL_list=Array{String,1}()
    URL_set=Set{String}() #set of links
    for line in eachline(result_file)
        push!(URL_list,line)
        push!(URL_set,line)
        @info line #initialize the URL list from the local file
    end
    @label start
    urls_set_list=map(from_link2links!,URL_list)
    URL_list=Array{String,1}()
    for u_set in urls_set_list
        for url in u_set
            #reduce step: check the quality of each url
            #check_url(url)
            #url -> set
            if url ∉ URL_set
                push!(URL_set,url)
                push!(URL_list,url)
            end
        end
    end
    @info "URL_set size: $(length(URL_set))"
    @info "URL_list size: $(length(URL_list))"
    isempty(URL_list) || @info "random element of URL_list: $(rand(URL_list))" #rand on an empty array would throw
    flush(io) #flush the global log file
    if (length(URL_set)<100_000) && (length(URL_list) > 0)
        #stop when URL_list is empty or the link set reaches 100,000
        @goto start
    end
    @info "writing results to file"
    open(result_file,"w") do io
        for node in URL_set
            write(io,node*'\n')
        end
    end
    @info "build_htmlSet finished."
end
#stdout can be redirected to a file, but that does not affect the output of @info etc.
#io = open("log.txt", "w+"); redirect_stdout(io)
io = open("log.txt", "w+")
logger = SimpleLogger(io)
global_logger(logger)
@time build_htmlSet()
flush(io)
close(io)
Link filtering
#========================================
Extract the homepage (host) part of each link in the URL file
=====================================#
function cut_url(file_name::String)
    main_url=Set{String}()
    pattern=r"//.+?/"
    for line in eachline(file_name)
        #regex approach:
        #m=match(pattern,line); @show m.match[3:end-1]
        #split approach:
        str_list=split(line,"/")
        try
            url_str=str_list[3]
            push!(main_url,url_str)
        catch e
            @error line
            continue
        end
    end
    io=open("tmp.txt","w")
    for iter in sort!(collect(main_url))
        write(io,iter*'\n')
    end
    flush(io)
    close(io)
    @info "cut_url finished."
end
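#= Quick comparison of the two extraction approaches above, on a
hypothetical line: =#
#line="https://example.com/some/page"
#m=match(r"//.+?/",line)
#@show m.match[3:prevind(m.match,lastindex(m.match))] #-> "example.com"
#@show split(line,"/")[3]                             #-> "example.com"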
#==============================
Filter edu/gov entries out of the URLs
===========================#
function del_edugov(fileName)
    #count lines in the file: cat fileName | wc -l
    #transcode the file: iconv -f ISO-8859-14 -t UTF-8 tmp.txt -o tmp1.txt
    url_list=String[]
    for line in eachline(fileName)
        #note: substring matching also drops hosts that merely contain "edu"/"gov"
        if occursin("edu",line) continue end
        if occursin("gov",line) continue end
        push!(url_list,line)
    end
    io=open(fileName,"w")
    for iter in url_list
        write(io,iter*'\n')
    end
    flush(io)
    close(io)
    @info "del_edugov finished."
end
#cut_url("link_palace.txt")
#del_edugov("tmp1.txt")
Traversing IPv4
#===============================================
IP: 255.255.255.255 -> 0.0.0.0
Try to enumerate websites by walking the IP address space [4_294_967_296 = 256^4]
===========================================#
import Base.Filesystem.filesize
function findAllSite_fromIP()
    #io_ip=IOBuffer(read=true,write=true,maxsize=800)
    #split(String(take!(io_ip)),'\n')
    io_ip=Channel{String}(10000)
    #=
    #too slow
    task=@async begin
        for ip1=0:255, ip2=0:255, ip3=0:255, ip4=0:255
            IP="$(ip4).$(ip3).$(ip2).$(ip1)"
            #write(io_ip,IP*'\n')
            put!(io_ip,IP) #push into the channel
        end
    end
    =#
    task=@async begin
        for i=1:(256^4)
            #build the address from 4 random octets
            IP="$(rand(0:255)).$(rand(0:255)).$(rand(0:255)).$(rand(0:255))"
            put!(io_ip,IP)
        end
    end
    bind(io_ip,task)
    @info "start probing URLs..."
    count=0
    i=1
    io=open("tmp2ip_http.txt","w")
    for IP in io_ip
        count+=1
        if count>=10_000_000
            count=0
            println("batch $i of ten million IPs done, current IP: $(IP)")
            i+=1
        end
        try
            run(`curl -m 10 http://$(IP) -o tmp.txt`)
            tmpsize=filesize("tmp.txt")
            if tmpsize > 100
                write(io,IP*'\n')
                flush(io)
            end
            rm("tmp.txt")
        catch e
            continue
        end
    end
    close(io)
    @info "IPs probed since the last ten-million checkpoint: $(count)"
end
#findAllSite_fromIP()
#practically unusable: far too slow
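#= A systematic alternative to the random sampling above (sketch): map an
integer n in 0:256^4-1 to a dotted quad, so the whole space can be walked
without four nested loops. =#
function int2ip(n::Integer)
    #extract the four octets, most significant first
    return string((n>>24)&255,'.',(n>>16)&255,'.',(n>>8)&255,'.',n&255)
end
#@assert int2ip(0)=="0.0.0.0"
#@assert int2ip(256^4-1)=="255.255.255.255"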
Linear congruential random numbers
function seed(X_0=467_892_980_981)
    m=1_000_000_000_000
    a=827_632_344_621
    c=378_902_378_437
    #a*X_0 overflows Int64, so take the product in 128 bits
    X_1=Int64(mod(widemul(a,X_0)+c,m))
    return X_1,m,a,c
end
function linear_rand(num)
    X_n,m,a,c=seed()
    num_list=Array{Int64,1}()
    for i=1:num
        #widemul avoids Int64 overflow in a*X_n
        X_nplus1=Int64(mod(widemul(a,X_n)+c,m))
        #other recurrences tried:
        #X_nplus1=(round(ℯ^c)*round(X_n*pi)+a)%m
        #X_nplus1=abs(round(a*tanh(X_n)+c)%m)
        #@show X_nplus1
        push!(num_list,X_nplus1)
        X_n=X_nplus1
    end
    return num_list
end
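#= Why widemul above: with a ≈ 8.3e11 and X_n up to ~1e12, the product
a*X_n exceeds typemax(Int64) (~9.2e18) and silently wraps. For example: =#
#@show 827_632_344_621*999_999_999_999          #wraps around in Int64
#@show widemul(827_632_344_621,999_999_999_999) #exact Int128 product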
import PyPlot: plot, gcf
function 使用分布检验随机数生成()
    #tmp=sort!(randn(1000))
    tmp=sort!(linear_rand(1000))
    plot(tmp) #for a uniform generator the sorted sequence should look roughly linear
    display(gcf()) #a bare show() would hit Base.show; display the current figure instead
end
使用分布检验随机数生成()
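#= A plot-free sanity check (sketch): bucket 10_000 draws into 10 equal
bins over [0,m); a reasonable LCG puts roughly 1_000 in each. =#
#counts=zeros(Int,10)
#for x in linear_rand(10_000)
#    counts[div(x,100_000_000_000)+1]+=1
#end
#@show counts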
Loops vs. recursion
function 阶乘_循环(num::BigInt)
    acc=big(1) #named acc because "one" would shadow Base.one
    if num<=1 return big(1) end
    for i=2:num
        acc*=i
    end
    return acc
end
function 阶乘_递归(num::BigInt)
    if num<=1 return big(1) end
    return num*阶乘_递归(num-1)
end
function test()
    asd::BigInt=500
    @time result1=阶乘_递归(asd) #note: the first @time also measures compilation
    @time result2=阶乘_循环(asd)
    @assert result1 == result2
    #compare the timings to see which version is faster
end
test()
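#= Cross-check against Base (sketch): factorial accepts BigInt, so the
hand-rolled versions can be verified directly. =#
#@assert 阶乘_循环(big(20))==factorial(big(20))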