代码片段
计算 菲波那切
function fib(n::Int64=10)
#1->40: 69s
@assert n >= 0
if n==0 || n==1
return n==0 ? 0 : 1
else
return fib(n-1) + fib(n-2)
end
end
function fib2(N::Int)
# 1->40: 0.31s
@assert N >= 0
fib_list=[0,1]
if N==0 || N==1
return N==0 ? 0 : 1
else
for i=2:N
push!(fib_list,(fib_list[i]+fib_list[i-1]) )
end
end
return fib_list[end]
end
function test()
for i=1:40
println("fib($i)的结果为 : ",fib2(i) )
end
end
test()
摇奖
好像不太随机,试了几次,一直没抽到“阿呆”
function 开奖()
奖级别=[1,1,2,6] #一等奖一个...
选手名字=Set(["卡卡西","鸣人","佐助","小樱","志乃","新之助","风间彻","妮妮","正男","阿呆"])
中奖名单=Dict() #可以保存名单到文件
println("奖品类型",length(奖级别),"选手数量",length(选手名字))
@assert sum(奖级别)==length(选手名字) "奖品数量和人的数量不匹配!"
N=1
for 奖品数量 in 奖级别
for i = 1:奖品数量
获奖者=rand(选手名字)
中奖名单[获奖者]=N
pop!(选手名字,获奖者)
end
N+=1
end
sort!(中奖名单,by x => values(x) )
for 人 in keys(中奖名单)
println("中奖人:$人 --- : $(中奖名单[人]) 等奖")
end
end
开奖()
给markdown文件添加目录索引
function startwith(head,line)
#println(line[1],head,line[1]==head)
if line[1] == head #第一个字符相同
return true
else
return false
end
end
function sub_head(head_list,need2sub='#')
#使用数字替换掉所有的#
count=1
for line in head_list
line=replace(line, need2sub => "~" )
head_list[count]=line
count+=1
end
end
function 包装(head_list)
local mark_count=1
for line in head_list
head_list[mark_count]="" * line * " \n"
mark_count+=1
end
pushfirst!(head_list,"\n --- \n ")
pushfirst!(head_list,"目录: ")
push!(head_list,"\n --- \n")
end
function add_index2markdown()
#给markdown文件按照头(#)添加目录索引
Markdown_file="/home/asen/文档/war.md"
mark_count=1 #标记#头的出现次数
head_list=[] #存储头内容
new_str=""
open(Markdown_file,"r+") do io
for line in eachline(io)
if length(line) < 2 #避免空行导致的bounderror
continue
end
if startwith('#',line)
push!(head_list,line)
substr=" \n"
new_str *= substr * line * '\n'
mark_count+=1
else
new_str *= line * '\n'
end
end
sub_head(head_list)
包装(head_list)
end
open(Markdown_file,"w") do io
for line in reverse(head_list)
#println(line)
new_str=line * new_str
end
#print(new_str)
write(io,new_str)
end
println("markdown 文件添加目录(页内索引)完成。")
end
add_index2markdown()
# 第二版 : 只提取二级标题
function 包装(head_list,need2sub="##")
local mark_count=1
for line in head_list
line=replace(line, need2sub => "~ $mark_count ~" )
head_list[mark_count]="" * line * "\n\n"
mark_count+=1
end
pushfirst!(head_list,"\n --- \n ")
pushfirst!(head_list,"**目录**: ")
push!(head_list,"\n --- \n")
end
function add_index2markdown()
#给markdown文件按照头(##)添加目录索引
Markdown_file="银行法西斯.md"
mark_count=1 #标记#头的出现次数
head_list=[] #存储头内容
new_str=""
for line in eachline(Markdown_file)
if startswith(line,"##") && (line[3]!='#')
push!(head_list,line)
substr="\n"
new_str *= '\n'*line*substr
mark_count+=1
else
new_str *= line * "\n"
end
end
包装(head_list)
open("tmp.md","w") do io
str=""
[str=str*line for line in head_list]
new_str=str * new_str
write(io,new_str)
end
println("markdown 文件添加目录(页内索引)完成。")
end
add_index2markdown()
熵值计算:Julia版
#============================================
计算熵(Julia version)232s
============================================#
using PyPlot
function compute_entropy()
dir_name="/path/corpus.txt"
word_freq_table=Dict{Char,UInt64}() #时间缩短为57s
Global_entropy,all_num=0,0
@info "读取文件。。。"
open(dir_name,"r") do F
for line in eachline(F) #去掉断行检查时间缩短为47s
for word in line
if word in keys(word_freq_table)
word_freq_table[word]+=1
else
word_freq_table[word]=1
end
end
end
end
word_num=length(word_freq_table)
@show "词频表的长度为$word_num"
all_num=sum(values(word_freq_table)) #sum 与for 性能相近
println("all_num=$all_num 计算熵。。。")
P =freq::UInt64 -> freq/all_num
entropy= prob::Float64 -> -prob*log2(prob)
#=
open("entropy_tmp.txt","w") do io
for (k,v) in sort(collect(word_freq_table),by=x->x[2])
val1=P(v)
val2=entropy(val1)
write(io,"$k \t freq $v \t 概率:$(val1) \t 熵:$(val2) \n")
Global_entropy+=val2
end
end
=#
@info Global_entropy
#=
Global_entropy=sum(
[entropy(P(i)) for i in values(word_freq_table)])
#表达方式更紧凑,没有性能提升
println("the entropy is : ",Global_entropy)
@info ("start plot ...")
tmp=[P(i) for i in values(word_freq_table)]
plot(tmp)
show()
=#
end
#@time compute_entropy()
去掉字符串中的标点的四种方法
import Base.Unicode.ispunct
function del_punct(line)
#去掉标点符号的四种方法
tmp=""
for word in line
#ispunct(word) || (tmp=tmp*word)
#ispunct(word) ? true : tmp=tmp*word
#if !ispunct(word) tmp=tmp*word end
!ispunct(word) && (tmp=tmp*word)
end
return tmp
end
asd="asdfhg34,./,g56uuj678i78ol./p;oyjdthhs.;/lyjfgh"
println(del_punct(asd))
通过存储链接名的文件下载内容
import Base.Iterators.Stateful
import Base.Filesystem.filesize
import Base.Filesystem.walkdir
import Base.Filesystem.rm
function get_PathName_from_dir(dir_name)
#目录名->iter(file_name)
#walkdir :是深度优先搜索
filename_list=[]
function get_filename(dir_one)
for (root,dirs,files) in walkdir(dir_one)
for file in files
push!(filename_list,(root*'/'*file) )
end
end
end
get_filename(dir_name)
return Stateful(filename_list)
end
function get_url_from_file(filename)
#从存储链接的文件中获取:主题名-url
NameUrl=Dict()
for line in eachline(filename)
name,url,other=split(line,['!','?'])
NameUrl[name]=url
end
return Stateful(collect(NameUrl) )
end
function get_context_from_net2()
dir_name="tmp/"
for file_name in get_PathName_from_dir("Baidu")
for (name,url) in get_url_from_file(file_name)
try
download(url,dir_name*name*".html")
catch
continue
end
end
break
end
println("文件下载完成")
end
function get_context_from_net()
proxy=`-x 182.106.140.122:80` #curl 使用代理 -x ip:port
retry=`--retry 7 --retry-delay 10 --retry-max-time 10 ` #重试次数,间隔
#直接通过url无法下载,why?
#command=`wget https://www.baidu.com -O filename`
#暂时使用名字代替
#curl: (56) Received HTTP code 502
#速度太慢,先下5万
for file_name in get_PathName_from_dir("Baidu")
for (name,url) in get_url_from_file(file_name)
command=`curl $(proxy) $(retry) https://baike.baidu.com/item/$(name) -o htmlfile/$(name).html`
try
run(command)
catch
continue
end
end
end
println("文件下载完成")
end
function rm_null_file()
#去掉空文件
dir_name="htmlfile/"
count::Int64=0
for file_name in readdir(dir_name) #默认是本目录
if filesize(dir_name*file_name) < 10
rm(dir_name*file_name)
count+=1
end
end
@info "空文件的数量为: $count ;\n "
end
function main()
get_context_from_net()
rm_null_file()
end
main()
#
# 测试函数
#
function test_get_PathName_from_dir(dir_name)
iter=get_PathName_from_dir(dir_name)
for file in iter
println(file)
end
end
function test_get_url_from_file()
iter=get_url_from_file("Baidu/地理/阿根廷.txt")
count=0
for (N,U) in iter
count+=1
println("名字:$N \t 链接:$U")
if count>10 break end
end
end
function test()
test_get_PathName_from_dir()
test_get_url_from_file()
end
#test()
提取本地目录中html文件的链接
function get_context_from_html!(file_name,url_list,Pattern,class=1)
#change url_list
#class:1=>url ; 2=>picture ;3=>text
tmp=""
for line in eachline(file_name)
if occursin(Pattern,line)
m=match(Pattern,line)
#掐头去尾-> 纯url
tmp=m.match
if class==1 && length(tmp)>10
try
#push!(url_list,tmp[7:end-1])
#StringIndexError :变长编码索引困难
push!(url_list,tmp[7:prevind(tmp,lastindex(tmp))])
catch e
println(tmp,length(tmp))
throw(e)
end
end
#println(m.match) #查看匹配的内容
end
end
return url_list
end
#==================================
将本地目录中的html网页中的链接提取出来
html文件目录 -> superlink文件
===============================#
function deal_html(dir_name="htmlfile/")
#正则表达式模式定义
#Pattern=r"^$" #匹配
Pattern=r"href=\"https?.+?\"" #匹配 href=“https:”
#结构,变量定义
class=1 #class:1=>url ; 2=>picture ;3=>text
result_file="superlink.txt"
fileName_iter=get_PathName_from_dir(dir_name)
url_list=Array{String,1}()
@info "逐个文件处理..."
for fileName in fileName_iter
println(fileName)
get_context_from_html!(fileName,url_list,Pattern,class)
#break
end
#for url in url_list println(url) end
@info "结果存入文件"
open(result_file,"w") do io
for node in Set(url_list)
write(io,node*'\n')
end
end
@info "deal_html runing is end!"
end
deal_html()