sub_types=wait.until( ExpectedConditions.presenceOfAllElementsLocatedBy( By.xpath( path.get() ) ) );
for (WebElement sub : sub_types) {
if(sub.getText().equals( "科类" ))
continue;
path.set( "/html/body/div[1]/div/div/div/div/div/div/div[3]/div[1]/div[1]/div/div[2]/div[1]/div/div[2]/div/div/div" );
waitToBeLoaded( driver,path.get() );
//4.Choose subject type
WebElement until5 = wait.until(ExpectedConditions.presenceOfElementLocated(By.xpath(path.get())));
driver_js.executeScript( "arguments[0].click();",until5 );
new Actions( driver ).moveToElement( sub ).click().perform();
//5.Choose betch
path.set( "/html/body/div[1]/div/div/div/div/div/div/div[3]/div[1]/div[1]/div/div[2]/div[1]/div/div[4]/form/div/div/div/span/div" );
waitToBeLoaded( driver,path.get() );
WebElement until=wait.until( ExpectedConditions.presenceOfElementLocated( By.xpath( path.get() ) ) );
//处理异常:element click intercepted: Element betchs=wait.until( ExpectedConditions.presenceOfAllElementsLocatedBy( By.xpath( path.get() ) ) );
// System.out.println("**********"+betchs.size());
for (WebElement b : betchs) {
try{
sleep( time*2 );
}catch(InterruptedException e){
e.printStackTrace();
}
// System.out.println("*********3");
new Actions( driver ).moveToElement( b ).click().perform();
// System.out.println("********4");
String temp_bet=driver.findElement( By.xpath( "/html/body/div[1]/div/div/div/div/div/div/div[3]/div[1]/div[1]/div/div[2]/div[1]/div/div[4]/form/div/div/div/span/div/div" ) ).getText();
try{
sleep( time*2 );
}catch(InterruptedException e){
e.printStackTrace();
}
// System.out.println("***********5");
if(temp_bet.equals( "批次" ))
continue;
//6.Get page's content
path.set( "/html/body/div[1]/div/div/div/div/div/div/div[3]/div[1]/div[1]/div/div[2]/div[2]/div[1]/table/tbody/tr[2]" );
boolean isEmpty=IsEmptyTds(driver,path.get());
if(!isEmpty)
{
path.set( "/html/body/div[1]/div/div/div/div/div/div/div[3]/div[1]/div[1]/div/div[2]/div[2]/div[3]/div/div/ul/li" );
List do_page=wait.until( ExpectedConditions.presenceOfAllElementsLocatedBy( By.xpath( path.get()) ) );
Integer last_page_num=do_page.size()-2;
//6.2 for..
do {
//6.3 get..
path.set( "/html/body/div[1]/div/div/div/div/div/div/div[3]/div[1]/div[1]/div/div[2]/div[2]/div[1]/table/tbody/tr[position()>1]" );
//waitToBeLoaded( driver,path.get() );
try{
sleep( 10*time );
}catch(InterruptedException e){
e.printStackTrace();
}
List trs=wait.until( ExpectedConditions.presenceOfAllElementsLocatedBy( By.xpath( path.get() ) ) );
for (WebElement tr : trs) {//G
// et
System.out.println("44444444");
sleep(time*25);
List tds=tr.findElements( By.xpath( "td" ) );
String temp=tds.get( 0 ).getText();
if("-".equals( temp )){
// System.out.println( "遇到‘-’跳出" );
continue;
}
EnrollmentPlan enrollmentPlan=new EnrollmentPlan();
enrollmentPlan.setBatch( temp_bet );
enrollmentPlan.setSchoolId( p );
enrollmentPlan.setProvince( temp_pro );
enrollmentPlan.setSpecialName( temp );
enrollmentPlan.setSubjectType( tds.get( 1 ).getText() );
enrollmentPlan.setSpecialType( tds.get( 2 ).getText() );
String temp_num_1=tds.get( 3 ).getText();
String temp_num_2=tds.get( 4 ).getText();
if(!temp_num_1.equals( "-" )&&temp_num_2.equals( "-" ))
enrollmentPlan.setPlanNum( Long.parseLong( temp_num_1 ) );
if(!temp_num_2.equals( "-" )&&temp_num_1.equals( "-" ))
enrollmentPlan.setPlanNum( Long.parseLong( temp_num_2 ) );
sleep( time );
System.out.println(enrollmentPlan);
lists.add( enrollmentPlan );
}
//下一页
if(do_page.size() > 3){
int next=do_page.size()-1;
System.out.println("5555555555");
sleep(time*25);
//wait.until( ExpectedConditions.elementToBeClickable( do_page.get( next ) ) ).click();
WebElement until2 = do_page.get(next);
driver_js.executeScript("arguments[0].click();",until2);
}
last_page_num--;
} while (last_page_num > 0);
}
//批次跳转
path.set( "/html/body/div[1]/div/div/div/div/div/div/div[3]/div[1]/div[1]/div/div[2]/div[1]/div/div[4]/form/div/div/div/span/div" );
waitToBeLoaded( driver,path.get() );
WebElement until2 = wait.until(ExpectedConditions.presenceOfElementLocated(By.xpath(path.get())));
driver_js.executeScript("arguments[0].click();",until2);
}
//批次初始化
if(betchs.size() > 1){
try{
sleep( time );
}catch(InterruptedException e){
e.printStackTrace();
}
path.set( "/html/body/div[1]/div/div/div/div/div/div/div[3]/div[1]/div[1]/div/div[2]/div[1]/div/div[4]/form/div/div/div/span/div/div" );
waitToBeLoaded( driver,path.get() );
WebElement until4 = wait.until(ExpectedConditions.presenceOfElementLocated(By.xpath(path.get())));
driver_js.executeScript( "arguments[0].click();",until4 );
try{
sleep( time );
}catch(InterruptedException e){
e.printStackTrace();
}
path.set( "/html/body/div[7]/div/div/div/ul/li[1]" );
waitToBeLoaded( driver,path.get() );
// System.out.println( "遇到‘-’跳出1" );
WebElement until2=wait.until( ExpectedConditions.presenceOfElementLocated( By.xpath( path.get() ) ) );
// System.out.println( "遇到‘-’跳出2" );
try{
sleep( time );
}catch(InterruptedException e){
e.printStackTrace();
}
// System.out.println( "遇到‘-’跳出3" );
new Actions( driver ).moveToElement( until2 ).click().perform();
// System.out.println( "遇到‘-’跳出4" );
try{
sleep( time );
}catch(InterruptedException e){
e.printStackTrace();
}
}
//鼠标悬浮事件点击
try{
sleep( time );
}catch(InterruptedException e){
e.printStackTrace();
}
}
//schoolService.saveAllEnrollmentPlan( lists );
}
//7 Save lists
return lists;
}
private boolean IsEmptyTds(WebDriver driver,String s) throws InterruptedException {
WebDriverWait wait=new WebDriverWait( driver,20 );
WebElement tr=wait.until( ExpectedConditions.presenceOfElementLocated( By.xpath( s ) ) );
String td=tr.findElements( By.xpath( "td" ) ).get( 0 ).getText();
return td.equals( "-" );
}
//阻塞
private void waitToBeLoaded(WebDriver driver,String path) {
while (!isLocation( driver,path )) {
try{
//System.out.println(1);
sleep( 500 );
}catch(InterruptedException e){
e.printStackTrace();
}
}
try{
sleep( 500 );
}catch(InterruptedException e){
e.printStackTrace();
}
}
//判断
private boolean isLocation(WebDriver driver,String path) {
try{
sleep( 500 );
driver.findElement( By.xpath( path ) );
System.out.println("定位到:"+path);
return true;
}catch(Exception e){
System.out.println("定位不到:"+path);
return false;
}
}
}
```
- 2. 处理篇
主要使用mysql语句处理已爬取数据,诸如,去重,合并
- 2.1 表去重

- mysql语句:
```
#建立临时表
create table if not exists enrollment_plan_temp select * from enrollment_plan where id<816 order by id ASC;
drop temporary table enrollment_plan_temp;
select * from enrollment_plan_temp;
#建立索引
alter table enrollment_plan_temp add index school_id_index(school_id);
alter table enrollment_plan_temp add index school_province_index(province);
alter table enrollment_plan_temp add index school_plan_num_index(plan_num);
alter table enrollment_plan_temp add index school_batch_index(batch);
alter table enrollment_plan_temp add index school_special_name_index(special_name);
drop table enrollment_plan_temp_unique;
create table if not exists enrollment_plan_temp_unique(
id int(2) primary key auto_increment,
school_id int(2) comment '学校编号',
province varchar(30) comment '生源地',
subject_type varchar(30) comment '科类',
batch varchar(35) comment '批次',
special_name varchar(500) comment '专业名称',
special_type varchar(40) comment '专业门类',
plan_num int(2) comment '招生人数'
);
drop procedure enrollment_plan_temp_unique_pro;
create procedure enrollment_plan_temp_unique_pro()
begin
DECLARE id1 int(2) ;
declare school_id1 int(2) ;
declare province1 varchar(30);
declare subject_type1 varchar(30);
declare batch1 varchar(35);
declare special_name1 varchar(500);
declare special_type1 varchar(40);
declare plan_num1 int(2) ;
declare count_num int(2);
declare plan_num_temp int(2);
declare id_temp int(2);
-- 遍历数据结束标志
DECLARE done INT DEFAULT FALSE;
-- 定义游标
DECLARE cur CURSOR FOR select id,school_id,province,subject_type,batch,special_name,special_type,plan_num FROM enrollment_plan_temp;
-- 将结束标志绑定到游标
DECLARE CONTINUE HANDLER FOR NOT FOUND SET done = TRUE;
-- 打开游标
set count_num=0;
set id_temp=0;
OPEN cur;
-- 开始循环
read_loop: LOOP
-- 提取游标里的数据,这里只有一个,多个的话也一样;
FETCH cur INTO id1,school_id1,province1,subject_type1,batch1,special_name1,special_type1,plan_num1;
if done then
leave read_loop;
end if;
-- 业务逻辑
-- 找到重复的
select count(*) into count_num from enrollment_plan_temp_unique t where t.special_name like special_name1 and t.province = province1 and t.school_id=school_id1 ;
if count_num=0 then#表中没有则插入
INSERT into enrollment_plan_temp_unique(school_id,province,subject_type,batch,special_name,special_type,plan_num) VALUES(school_id1,province1,subject_type1,batch1,special_name1,special_type1,plan_num1);
else#重复count_num=1,表中有则更新
begin
update enrollment_plan_temp_unique SET plan_num = plan_num + plan_num1 where id in (select a.id from
(select id
from enrollment_plan_temp_unique
where special_name = special_name1
and province = province1
and school_id = school_id1
) a);
set count_num=0;
end;
end if;
END LOOP;
-- 关闭游标
CLOSE cur;
end;
call enrollment_plan_temp_unique_pro();
```
- 2.2 三表合并
同校,同省,同专业



- mysql语句:
```
#两表合并,表一截取括号前面的于表二字符串匹配
create table if not exists result_college select e.id,e.school_id,e.province,e.special_name,e.subject_type,e.special_type,e.batch,e.plan_num,s.score,s.position_num from enrollment_plan_temp_unique_v3 e ,special_score_temp_unique s where e.school_id=s.school_id and e.province=s.province and e.special_name like concat(substring_index(s.special_name,'(',1),'%');
drop table result_college_unique;
create table if not exists result_college_unique(
id int(2) primary key auto_increment,
school_id int(2) comment '学校编号',
province varchar(30) comment '生源地',
subject_type varchar(30) comment '科类',
batch varchar(35) comment '批次',
special_name varchar(500) comment '专业名称',
special_type varchar(40) comment '专业门类',
plan_num int(2) comment '招生人数',
score int(2) comment '录取分数',
position_num int(2) comment '位次排名'
);
drop procedure result_college_unique_pro;
create procedure result_college_unique_pro()
begin
DECLARE id1 int(2) ;
declare school_id1 int(2) ;
declare province1 varchar(30);
declare subject_type1 varchar(30);
declare batch1 varchar(35);
declare special_name1 varchar(500);
declare special_type1 varchar(40);
declare plan_num1 int(2) ;
declare count_num int(2);
declare score1 int(2);
declare position_num1 int(2);
declare id_temp int(2);
-- 遍历数据结束标志
DECLARE done INT DEFAULT FALSE;
-- 定义游标
DECLARE cur CURSOR FOR select id,school_id,province,subject_type,batch,special_name,special_type,plan_num,score,position_num FROM result_college;
-- 将结束标志绑定到游标
DECLARE CONTINUE HANDLER FOR NOT FOUND SET done = TRUE;
-- 打开游标
set count_num=0;
set id_temp=0;
OPEN cur;
-- 开始循环
read_loop: LOOP
-- 提取游标里的数据,这里只有一个,多个的话也一样;
FETCH cur INTO id1,school_id1,province1,subject_type1,batch1,special_name1,special_type1,plan_num1,score1,position_num1 ;
if done then
leave read_loop;
end if;
-- 业务逻辑
-- 找到重复的
select count(*) into count_num from result_college_unique t where t.special_name = special_name1 and t.province = province1 and t.school_id=school_id1 ;
if count_num=0 then#表中没有则插入
INSERT into result_college_unique(school_id,province,subject_type,batch,special_name,special_type,plan_num,score,position_num) VALUES(school_id1,province1,subject_type1,batch1,special_name1,special_type1,plan_num1,score1,position_num1);
else#重复count_num=1,表中有则更新
begin
set count_num=0;
end;
end if;
END LOOP;
-- 关闭游标
CLOSE cur;
end;
call result_college_unique_pro();
delete from result_college_unique where special_type='-';
select r.id from school_special s ,result_college_unique r where s.specials like concat(substring_index(r.special_name,'类',1),'%') or s.specials like concat(r.special_name,'%') and s.school_id=r.school_id;
create table if not exists results(
id int(2) primary key auto_increment,
school_id int(2) comment '学校编号',
province varchar(30) comment '生源地',
subject_type varchar(30) comment '科类',
batch varchar(35) comment '批次',
special_name varchar(500) comment '专业名称',
special_type varchar(40) comment '专业门类',
plan_num int(2) comment '招生人数',
score int(2) comment '录取分数',
position_num int(2) comment '位次排名',
is_priory int(2) comment '一级学科'
);
insert into results(id, school_id, province, subject_type, batch, special_name, special_type, plan_num, score, position_num) select * from result_college_unique;
update results set is_priory=1 where id IN (select r.id from school_special s ,result_college_unique r where s.specials like concat(substring_index(r.special_name,'类',1),'%') or s.specials like concat(r.special_name,'%') and s.school_id=r.school_id);
```
- 2.3 结果集
 |