游万海/正则表达式/正则表达式附录程序.md · Stata007/StataSX2018

原文参考：https://gitee.com/Stata002/StataSX2018/edit/master/游万海/正则表达式/推文：Stata中的正则表达式和文本分析简介.md
// Demo: matching literal regex metacharacters ([, the dollar sign, *) with regexm().
clear
input str3 num str2 name str10 per str6 income
           -1       a          "10%"    "[9747"
            1       b          "62%"    "1,234"
            1       a          "53%"    "938.9"
           -1       c          "48,6%"  "*8344"
            2       d          "58%"    "2398"
           -2       e          "46%"    "-"
           -3       c          "78%"    "53822"
            3       d          "92,2%"  "na"
           -1       e          "65%"    "$28477"
            1       b          "3,6%"   "n/a"
end


// index: flags values that contain a literal [ (escaped with a backslash)
generate index = regexm(income,"\[")
// index1 to index4: alternative spellings of a pattern meant to match a literal dollar sign
generate index1 = regexm(income,"\\$")
generate index2 = regexm(income,"[\$]")
generate index3 = regexm(income,"[$]")
// char(36) is the dollar sign; the charlist command can be used to look up such character codes
generate index4 = regexm(income,"[`=char(36)']")
// index5: flags values that contain a literal asterisk
generate index5 = regexm(income,"\*")
// index6: flags values that contain either [ or an asterisk
// NOTE(review): inside brackets the | is presumably a literal member of the class, not an "or" operator
generate index6 = regexm(income,"[\*|\[]")
list


// Demo: \D (non-digit) versus \d (digit) with ustrregexm().
clear
input str12 income
"123"
"acb"
"12a"
end
// index1: 1 when the value contains at least one non-digit character (\D)
generate index1 = ustrregexm(income,"\D")
// index2: 1 when the value contains at least one digit (\d)
generate index2 = ustrregexm(income,"\d")

// Demo: \w (word characters) versus \W (non-word characters) with ustrregexra().
clear
input str64 income
"the dandelion war 2010"
end
// two working copies of the original text, one per pattern
gen make2 = income
gen make3 = income
// make2: every word character (letter, digit, underscore) becomes an underscore
replace make2 = ustrregexra(make2, "\w", "_")
// make3: every non-word character becomes an underscore.
// Read from make3 (still the original text) rather than from the already-masked
// make2, otherwise the result is nothing but underscores and shows nothing about \W.
replace make3 = ustrregexra(make3, "\W", "_")

// Demo: the word-boundary anchor \b with ustrregexm().
clear
input str12 income
"abc"
"abc"
"1ab"
"1abc"
"ab_"
"ab"
end
// index1: plain substring match, 1 wherever "ab" occurs anywhere in the value
generate index1 = ustrregexm(income,"ab")
list

// index2: "ab" must begin at a word boundary
generate index2 = ustrregexm(income,"\bab")
// index3: "ab" must end at a word boundary
generate index3 = ustrregexm(income,"ab\b")
// index4: "ab" must stand as a whole word
generate index4 = ustrregexm(income,"\bab\b")

// Demo: bracketed character classes and negation with ustrregexm().
clear
input str10 income
"abc"
"ab"
"aa"
"abcd"
"aad"
"aab123"
"cdf12345"
"123"
"Abc"
end
// index1: contains a digit
generate index1 = ustrregexm(income,"[0-9]")
// index2: contains a lowercase letter
generate index2 = ustrregexm(income,"[a-z]")
// index3: contains a lowercase vowel
generate index3 = ustrregexm(income,"[aeiou]")
// index4: a leading ^ inside the brackets negates the class, so this flags
// values containing at least one character other than a, e, i, o, u
generate index4 = ustrregexm(income,"[^aeiou]")
// index5: contains an uppercase letter
generate index5 = ustrregexm(income,"[A-Z]")


// Demo: quantifiers and anchors with ustrregexm().
clear
input str10 income
"abc"
"ab"
"aa"
"abcd"
"aad"
"a1"
"aab123"
"cdf12345"
"123"
end

// {n} asks for n consecutive repetitions. The patterns are unanchored, so a value
// matches as soon as it contains a run of at least n such characters.
// index1: at least one a
generate index1 = ustrregexm(income, "[a]{1}")
// index2: two consecutive a
generate index2 = ustrregexm(income, "[a]{2}")
// index3 to index5: runs of 2, 3 and 4 digits
generate index3 = ustrregexm(income, "[0-9]{2}")
generate index4 = ustrregexm(income, "[0-9]{3}")
generate index5 = ustrregexm(income, "[0-9]{4}")
// {m,n} asks for between m and n repetitions
generate index6 = ustrregexm(income, "[0-9]{1,3}")
generate index7 = ustrregexm(income, "[0-9]{4,5}")
// index8: + means one or more repetitions
generate index8 = ustrregexm(income, "[0-9]+")
// index9: * means zero or more repetitions (so every value matches)
generate index9 = ustrregexm(income, "[0-9]*")
// index10: ? means zero or one repetition (so every value matches)
generate index10 = ustrregexm(income, "[0-9]?")
// index11: ^ outside brackets anchors the match to the start of the value
generate index11 = ustrregexm(income, "^[0-9]")
// index12: picks out only the value a1, i.e. letters from the start followed by a final 1
generate index12 = ustrregexm(income, "(^[a-z]+)[1]$")
// index13: letters from the start followed by exactly one final digit;
// gives the same result as index12 on this data
generate index13 = ustrregexm(income, "(^[a-z]+)[0-9]$")


// Demo: POSIX bracket classes with ustrregexm().
clear
input str10 income
"abc"
"aB"
"aa"
"abcd"
"Aad"
"a1"
"aab123"
"cdf12345"
"123"
end
// index1: contains a lowercase letter
generate index1 = ustrregexm(income, "[[:lower:]]")
// index2: contains an uppercase letter
generate index2 = ustrregexm(income, "[[:upper:]]")
// index3: contains a digit
generate index3 = ustrregexm(income, "[[:digit:]]")


// Demo: extracting capture groups with regexm() and regexs().
clear
input strL report_text
"indication I want this 1 view"
"indication I want this 2 views"
"indications I want this 3 view"
"indications I want this 4 views"
"history I want this 5 view"
"history I want this 6 views"
"xxx I dont want this yyy"
"indication I dont want this either yyy"
"xxx nor this view"
end

// The pattern requires the lowercased text to start with a keyword and end with view or views;
// the dot matches any character. Group 0 is the whole match, group 1 the leading keyword,
// group 2 the text in between and group 3 the trailing view or views.
// indication0 to indication3 hold those groups and stay empty where the pattern does not match.
forvalues g = 0/3 {
    generate indication`g' = regexs(`g') if regexm(lower(report_text),"^(indications|indication|history)(.*)(views|view)$")
}
list

// Demo: anchors placed inside capture groups with regexm().
clear
input str10 income
"abc"
"ab"
"aa"
"abcd"
"aad"
"a1"
"aab123"
"cdf12345"
"123"
end
// index1: the first group anchors lowercase letters to the start of the value (^),
// the second group anchors a single digit to the end ($)
generate index1 = regexm(income, "(^[a-z]+)([0-9]$)")


// Demo: character class versus alternation versus a literal sequence with regexm().
clear
input str10 income
"abc"
"ab"
"aa"
"abcd"
"aad"
"aab123"
"cdf12345"
"123"
end
// index1: contains a or b (character class)
generate index1 = regexm(income,"[ab]")
// index2: contains a or b (alternation)
generate index2 = regexm(income,"(a|b)")
// index3: contains the two-character sequence ab
generate index3 = regexm(income,"(ab)")


// Scrape one page of the Douban Top 250 list and pull out, per film,
// the number of ratings, the score, the title, the year and the featured quote.
clear
// The later commands read the page text from a single variable named doctypehtml.
import delimited "https://movie.douban.com/top250?start=25&filter=", delimiters("^") varnames(1) rowrange(3) encoding("UTF-8") clear


// ---- number of ratings ----
// moss is a user-written command: net install moss, from(http://fmwww.bc.edu/RePEc/bocode/m)
moss doctypehtml, match("(人评价)") regex  prefix(c_)
// on lines where the marker text was found once, take the first run of digits
generate comment_num = real(regexs(1)) if regexm(doctypehtml,"([0-9]+)") & c_count==1

// one tempfile per extracted field; each is saved separately and merged at the end
tempfile comment_num score title year comment
preserve
    drop if comment_num==.
    keep comment_num
    save `comment_num', replace
restore
// use (tempfile comment_num), clear

// ---- score ----
moss doctypehtml, match("(v:average)") regex prefix(s_)
// on lines where the marker was found once, take the digit-dot-digit value
generate score = real(regexs(1)) if regexm(doctypehtml,"([0-9][\.][0-9])") & s_count==1
preserve
    drop if score==.
    keep score
    save `score', replace
restore
// use (tempfile score), clear


// ---- title ----
// title1: lines that end with a span tag mentioning title
generate title1 = regexs(1) if regexm(doctypehtml,"(\<span(.+)title(.*)\>$)")==1
// earlier attempt kept for reference:
// gen title6 = regexs(1) if regexm(doctypehtml,"(\<)span(.+)(.*)title(\>)(^&nbsp)(.+)")==1
// title2: the subset of title1 that contains the nbsp entity
generate title2 = title1 if regexm(title1,"(\&nbsp)")==1
// title is 1 where title1 and title2 differ, i.e. a title line without nbsp
generate title = (title1 != title2)
generate title_for = doctypehtml if title==1
preserve
    drop if title_for==""
    // NOTE(review): the first remaining line is presumably not a film title; confirm against the page
    drop in 1
    keep title_for
    split title_for, parse(> <)
    keep title_for3
    save `title', replace
restore

// ---- year ----
// four digits immediately followed by the nbsp entity
generate year = real(regexs(1)) if regexm(doctypehtml,"([0-9][0-9][0-9][0-9])(\&nbsp)")
preserve
    drop if year==.
    keep year
    save `year', replace
restore

// ---- featured quote ----
// lines holding a span tag that mentions inq
generate comment_text = regexs(0) if regexm(doctypehtml,"(\<)span(.+)inq")
generate comment = doctypehtml if comment_text != ""
preserve
    drop if comment==""
    keep comment
    split comment, parse(> 。)
    keep comment2
    save `comment', replace
restore

// ---- combine the fields row by row ----
use `comment_num', clear
merge 1:1 _n using `year', nogenerate
merge 1:1 _n using `comment', nogenerate
merge 1:1 _n using `title', nogenerate
merge 1:1 _n using `score', nogenerate

// split comment, parse(> 。)
drop if year==.
list, table
order title_for3 year score comment_num comment2
rename (title_for3 year score comment_num comment2) (电影名称 出版年份 电影评分 评论人数 经典评论)

// Scraping several pages: loop over the ten result pages of the Douban Top 250 list,
// extract the same five fields from each page and stack the pages in a tempfile.
//
// The accumulator must start out empty. Without the clear below, whatever dataset is in
// memory at this point would be saved into the accumulator and appended to the results.
clear
tempfile building
save `building', emptyok

// the page address is web1 + offset + web3, with offsets 225, 200, ..., 0
scalar web1="https://movie.douban.com/top250?start="
scalar web3="&filter="
forvalues i = 10(-1)1 {
    local k = (`i'-1)*25
    local url=web1 + "`k'" + web3
    *scalar list url
    display `"`url'"'
    // The later commands read the page text from a single variable named doctypehtml.
    import delimited  "`url'", delimiters("^") varnames(1) rowrange(3) encoding("UTF-8") clear

    // ---- number of ratings ----
    // moss is a user-written command: net install moss, from(http://fmwww.bc.edu/RePEc/bocode/m)
    moss doctypehtml, match("(人评价)") regex  prefix(c_)
    // on lines where the marker text was found once, take the first run of digits
    gen comment_num = real(regexs(1)) if regexm(doctypehtml,"([0-9]+)") & c_count==1

    // one tempfile per extracted field; each is saved separately and merged below
    tempfile comment_num score title year comment
    preserve
    drop if comment_num==.
    keep comment_num
    save `comment_num', replace
    restore

    // ---- score ----
    moss doctypehtml, match("(v:average)") regex prefix(s_)
    // on lines where the marker was found once, take the digit-dot-digit value
    gen score = real(regexs(1)) if regexm(doctypehtml,"([0-9][\.][0-9])") & s_count==1
    preserve
    drop if score==.
    keep score
    save `score', replace
    restore

    // ---- title ----
    // title1: lines that end with a span tag mentioning title
    gen title1 = regexs(1) if regexm(doctypehtml,"(\<span(.+)title(.*)\>$)")==1
    // title2: the subset of title1 that contains the nbsp entity
    gen title2 = title1 if regexm(title1,"(\&nbsp)")==1
    // title is 1 where title1 and title2 differ, i.e. a title line without nbsp
    gen title=(title1~=title2)
    gen title_for = doctypehtml if title==1
    preserve
    drop if title_for==""
    // NOTE(review): the first remaining line is presumably not a film title; confirm against the page
    drop in 1
    keep title_for
    split title_for, parse(> <)
    keep title_for3
    save `title', replace
    restore

    // ---- year ----
    // four digits immediately followed by the nbsp entity
    gen year= real(regexs(1)) if regexm(doctypehtml,"([0-9][0-9][0-9][0-9])(\&nbsp)")
    preserve
    drop if year==.
    keep year
    save `year', replace
    restore

    // ---- featured quote ----
    // lines holding a span tag that mentions inq
    gen comment_text= regexs(0) if regexm(doctypehtml,"(\<)span(.+)inq")
    gen comment = doctypehtml if comment_text~=""
    preserve
    drop if comment==""
    keep comment
    split comment, parse(> 。)
    keep comment2
    save `comment', replace
    restore

    // ---- combine the fields row by row, then add this page to the accumulator ----
    use `comment_num', clear
    merge 1:1 _n using `year', nogenerate
    merge 1:1 _n using `comment', nogenerate
    merge 1:1 _n using `title', nogenerate
    merge 1:1 _n using `score', nogenerate
    drop if year==.
    append using `building'
    save `"`building'"', replace
}
// load the stacked result, then show its first ten rows
use `"`building'"', clear
list in 1/10
Stata007 / StataSX2018

简介

发行版

贡献者

近期动态

Stata007 / StataSX2018

简介

发行版

贡献者

近期动态

搜索帮助

Stata007 / StataSX2018