代码片段

多级链接跳转

import Base.Iterators.Stateful
import Base.Filesystem.filesize
import Base.Filesystem.walkdir
import Base.Filesystem.rm
using Logging
"""
    get_context_from_html!(file_name, url_list, Pattern, class=1)

Scan `file_name` (anything `eachline` accepts) line by line and push every match of
`Pattern` into `url_list`, which is mutated and returned.

`class` selects how a match is post-processed before it is stored:

- `1` (hyperlink): a match longer than 10 characters is stored without its first six
  characters and without its last character.
- `2` (text): the match is stored unchanged.
- `3` (image tag): a match longer than 50 characters is stored without its first and
  last character.
- any other value: nothing is stored.

Every match on a line is collected, not just the first one. If an error occurs, the
match being processed and its length are printed and the error is rethrown.
"""
function get_context_from_html!(file_name,url_list,Pattern,class=1)
    tmp = ""  # last match seen; kept outside `try` so the catch block can report it
    try
        for line in eachline(file_name)
            # eachmatch yields nothing for lines without a match, so no separate
            # occursin pre-check is needed and the regex runs only once per line.
            for m in eachmatch(Pattern, line)
                tmp = m.match
                if class == 1 && length(tmp) > 10
                    # prevind/lastindex keep the slice valid for multi-byte text,
                    # where `end-1` may not be a character boundary.
                    push!(url_list, tmp[7:prevind(tmp, lastindex(tmp))])
                elseif class == 2
                    push!(url_list, tmp)  # stored as-is for now
                elseif class == 3 && length(tmp) > 50
                    push!(url_list, tmp[2:prevind(tmp, lastindex(tmp))])
                end
            end
        end
    catch
        println(tmp,length(tmp))
        rethrow()  # preserves the original backtrace
    end
    return url_list
end
"""
    from_link2links!(url)

Download `url` with `curl` into the scratch file `tmp.txt` and return a `Set{String}`
of the `http`/`https` targets found in `href` attributes of the downloaded page.

This is the "map" step of the multi-level crawl (one link in, a set of links out);
the caller performs the "reduce" step by merging the returned sets.

If the download fails, the error is logged and an empty set is returned, so that a
stale `tmp.txt` left by an earlier download is never parsed for this `url`.
"""
function from_link2links!(url)
    fileName = "tmp.txt"  # scratch file holding the downloaded HTML
    # Other patterns the author experimented with:
    #   r"<img.+?>"              image tags
    #   r">[^<a-zA-Z]+?[^>]<"    text
    Pattern = r"href=\"https?.+?\""  # matches href="http..." / href="https..."
    new_urls = Set{String}()
    try
        # curl is used instead of Base.download because the latter offers no way to
        # set a 10 s timeout; unreachable hosts would otherwise block for minutes.
        # -o overwrites the scratch file on every successful download.
        run(`curl -g -f --retry 0 -m 10 -o $fileName $url`)
    catch e
        e isa InterruptException && rethrow()  # let Ctrl-C stop the crawl
        @error "错误为:$e "
        return new_urls
    end
    get_context_from_html!(fileName,new_urls,Pattern)
    return new_urls
end
# Placeholder for further per-URL processing; it currently does nothing and
# returns `nothing`.
function check_url(url)
    return nothing
end
"""
    build_htmlSet(dir_name="htmlfile/")

Crawl outward from the links stored in `link_palace.txt`, level by level, and write the
resulting set of links back to that file.

Each round maps `from_link2links!` over the current frontier, then keeps only links
that have not been seen before as the next frontier. Crawling stops when a round finds
no new link or when the set holds at least 100,000 links.

`dir_name` is accepted for compatibility but is not used by this function.
"""
function build_htmlSet(dir_name="htmlfile/")
    result_file = "link_palace.txt"  # where the links are stored
    URL_list = Array{String,1}()     # frontier: links to fetch in the next round
    URL_set = Set{String}()          # every link seen so far

    # Initialise the frontier and the seen-set from the local file.
    for line in eachline(result_file)
        push!(URL_list, line)
        push!(URL_set, line)
        @info line
    end

    rounds = 0
    while true
        rounds += 1
        urls_set_list = map(from_link2links!, URL_list)
        URL_list = Array{String,1}()
        for u_set in urls_set_list
            for url in u_set
                # URL quality checks (check_url) would go here.
                if !(url in URL_set)
                    push!(URL_set, url)
                    push!(URL_list, url)
                end
            end
        end

        @info "URL_set的大小为:$(length(URL_set) )"
        @info "URL_list的大小为: $(length(URL_list))"
        # rand throws on an empty collection, which is exactly the terminating case.
        if !isempty(URL_list)
            @info "URL_list的随机元素:$(rand(URL_list) )"
        end
        # NOTE: `io` is the script-level log stream opened before this function is
        # called; flushing it makes progress visible while the crawl is running.
        flush(io)

        # Stop when there is nothing left to fetch or the set reached 100,000 links.
        if isempty(URL_list) || length(URL_set) >= 10_0000
            break
        end
    end

    @info "结果存入文件"
    open(result_file, "w") do out
        for node in URL_set
            write(out, node*'\n')
        end
    end
    @info "deal_html runing is end! $rounds"
    return nothing
end
# Redirecting stdout to a file would not capture the output of @info and friends, so a
# file-backed logger is installed instead.
# `io` is kept as a global because build_htmlSet flushes it between crawl rounds.
io = open("log.txt", "w+")
logger = SimpleLogger(io)
previous_logger = global_logger(logger)  # global_logger returns the logger it replaces
try
    @time build_htmlSet()
finally
    # Always restore the previous logger and close the log file, even if the crawl
    # fails, so later log calls do not target a closed stream.
    global_logger(previous_logger)
    flush(io)
    close(io)
end

链接筛选

"""
    cut_url(file_name::String, out_file::String="tmp.txt")

Extract the host part of every URL listed in `file_name` (one URL per line) and write
the distinct hosts, sorted, one per line, to `out_file`.

The host is taken as the third field when the line is split on `/`, e.g.
`http://example.com/page` yields `example.com`. Lines with fewer than three fields are
logged with `@error` and skipped. Returns `nothing`.
"""
function cut_url(file_name::String, out_file::String="tmp.txt")
    main_url = Set{String}()

    for line in eachline(file_name)
        str_list = split(line, "/")
        # Check the length up front instead of catching a BoundsError.
        if length(str_list) < 3
            @error line
            continue
        end
        push!(main_url, str_list[3])
    end

    # The do-block form closes the file even if a write fails.
    open(out_file, "w") do io
        for host in sort!(collect(main_url))
            write(io, host*'\n')
        end
    end
    @info "函数cur_url 结束."
    return nothing
end
"""
    del_edugov(fileName)

Rewrite `fileName` in place, dropping every line that contains the substring `edu` or
`gov`. The remaining lines keep their order. Returns `nothing`.

Shell helpers noted by the author: `cat fileName | wc -l` counts the lines, and
`iconv -f ISO-8859-14 -t UTF-8 tmp.txt -o tmp1.txt` converts the file encoding.
"""
function del_edugov(fileName)
    url_list = String[]  # typed buffer for the lines that are kept
    for line in eachline(fileName)
        (occursin("edu", line) || occursin("gov", line)) && continue
        push!(url_list, line)
    end
    # All lines were read above, so the file can now be overwritten; the do-block
    # closes it even if a write fails.
    open(fileName, "w") do io
        for iter in url_list
            write(io, iter*'\n')
        end
    end
    @info "函数del_edugov 结束。"
    return nothing
end

#cut_url("link_palace.txt")
#del_edugov("tmp1.txt")

遍历ipv4

#===============================================
    IP: 255.255.255.255-> 0.0.0.0
    尝试从IP地址遍历 网站 [42_9496_7296=256^4]

===========================================#
import Base.Filesystem.filesize
"""
    findAllSite_fromIP()

Probe randomly generated IPv4 addresses over HTTP and append every address whose
response body is larger than 100 bytes to `tmp2ip_http.txt`.

A background task produces `256^4` random dotted-quad strings into a buffered channel;
the main loop fetches each one with `curl` (10 s timeout) into the scratch file
`tmp.txt`. A progress line is printed every ten million addresses, and the number of
addresses that were not recorded is logged at the end. Returns `nothing`.

An earlier version enumerated all addresses with four nested loops; the author
abandoned it as too slow.
"""
function findAllSite_fromIP()
    io_ip = Channel{String}(10000)
    task = @async begin
        for i = 1:(256^4)
            # four independent random octets
            put!(io_ip, join(rand(0:255, 4), '.'))
        end
    end
    bind(io_ip, task)  # the channel closes when the producer task finishes
    @info "开始检测URL..."
    count = 0    # addresses seen since the last progress line
    i = 1        # number of the next ten-million batch to report
    invalid = 0  # addresses that failed or answered with too little data
    io = open("tmp2ip_http.txt", "w")
    try
        for IP in io_ip
            count += 1
            if count >= 1000_0000
                count = 0
                println("$i 个千万ip :$(IP)")
                i += 1
            end
            try
                run(`curl -m 10 http://$(IP) -o tmp.txt`)
                if filesize("tmp.txt") > 100
                    write(io, IP*'\n')
                    flush(io)
                else
                    invalid += 1
                end
                rm("tmp.txt"; force=true)
            catch e
                # A failed probe is expected and skipped, but Ctrl-C must still be
                # able to stop the scan.
                e isa InterruptException && rethrow()
                invalid += 1
            end
        end
    finally
        close(io)
    end
    @info "无效URL数量:$(invalid)"
    return nothing
end
#findAllSite_fromIP() 
#由于速度太慢 几乎不可用

线性同余计算 随机数

"""
    seed(X_0=4678_9298_0981)

Return `(X_1, m, a, c)`: the linear congruential parameters `m` (modulus), `a`
(multiplier), `c` (increment) and the first state `X_1 = (a*X_0 + c) mod m`.

`X_0` is expected to be an integer. The product `a*X_0` exceeds the `Int64` range for
the default seed, so it is computed with `widemul`; the reduced result lies in
`0:m-1` and is returned as an `Int64`.
"""
function seed(X_0=4678_9298_0981)
    m = 1000_0000_0000
    a = 8276_3234_4621
    c = 3789_0237_8437
    # widemul avoids the silent Int64 wrap-around of a*X_0; mod keeps the state
    # non-negative.
    X_1 = Int64(mod(widemul(a, X_0) + c, m))
    return X_1, m, a, c
end
"""
    linear_rand(num)

Return a `Vector{Int64}` with `num` successive states of the linear congruential
generator `X_{n+1} = (a*X_n + c) mod m`, starting from the state and parameters
returned by `seed()`.

The author also experimented with other recurrences (based on `exp`/`pi` and on
`tanh`); only the plain linear congruential step is implemented here.
"""
function linear_rand(num)
    X_n, m, a, c = seed()
    num_list = Array{Int64,1}()
    for i = 1:num
        # a*X_n does not fit in Int64, so multiply in a wider type before reducing;
        # mod keeps every state in 0:m-1.
        X_nplus1 = Int64(mod(widemul(a, X_n) + c, m))
        push!(num_list, X_nplus1)
        X_n = X_nplus1
    end
    return num_list
end
import PyPlot.plot
# Visual check of the random number generator: draw 1000 values from `linear_rand`,
# sort them in place and plot the sorted sequence.
# Swap in `randn(1000)` for the samples to compare against a reference generator.
# NOTE(review): the zero-argument `show()` presumably resolves to the plotting
# library's figure display — confirm.
function 使用分布检验随机数生成()
    samples = linear_rand(1000)
    sort!(samples)
    plot(samples)
    show()
end
# Run the generator check as soon as the script is loaded.
使用分布检验随机数生成()

循环与递归

"""
    阶乘_循环(num::BigInt)

Compute `num!` iteratively and return it as a `BigInt`.

`0!` and `1!` are both `1`. Throws a `DomainError` for negative `num`.
"""
function 阶乘_循环(num::BigInt)
    num < 0 && throw(DomainError(num, "factorial is not defined for negative integers"))
    acc = one(num)  # BigInt accumulator, so the variable never changes type
    for i = 2:num   # empty range for num <= 1
        acc *= i
    end
    return acc
end
"""
    阶乘_递归(num::BigInt)

Compute `num!` recursively and return it as a `BigInt`.

`0!` and `1!` are both `1`. Throws a `DomainError` for negative `num`.
"""
function 阶乘_递归(num::BigInt)
    num < 0 && throw(DomainError(num, "factorial is not defined for negative integers"))
    # `<=` also terminates for 0, which would otherwise recurse without end.
    if num <= 1 return one(num) end
    return num*阶乘_递归(num-1)
end
# Time the recursive and the iterative factorial on 500 (as a BigInt) and check that
# both produce the same value.
function test()
    n::BigInt = 500
    recursive = @time 阶乘_递归(n)
    looped = @time 阶乘_循环(n)
    # Original author's note (translated): "factorial is better" — it does not say
    # which variant is meant; compare the two timings printed above.
    @assert recursive == looped
end
# Run the factorial comparison as soon as the script is loaded.
test()