// Source path: /zhaopin/src/com/Dao/Impl/zrdpDaoImpl.java (from the "Java HTML page data scraping demo" project)
package com.Dao.Impl;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.springframework.orm.hibernate3.support.HibernateDaoSupport;

import com.Dao.zrdpDao;
import com.Model.Piliang;
import com.Model.Qzpiliang;

/**
 * Hibernate-backed implementation of {@link zrdpDao}.
 *
 * <p>Scrapes shop listings from ganji.com pages and persists them. Two kinds of
 * listing are handled: shops offered for transfer ({@link Piliang}) and
 * wanted-to-rent requests ({@link Qzpiliang}).
 *
 * <p>Field extraction is purely positional: each field is cut out of the raw HTML
 * relative to a literal marker string and then reduced to the characters of
 * interest with a character-class regex. It is therefore tightly coupled to the
 * page layout the markers were taken from.
 *
 * @author Administrator
 */
public class zrdpDaoImpl extends HibernateDaoSupport implements zrdpDao {

	/** Prefix prepended to a listing href to build the detail-page URL. */
	private static final String DETAIL_BASE_URL = "http://bj.ganji.com/fang6";

	/** Charset name used to decode every fetched page. */
	private static final String PAGE_CHARSET = "utf-8";

	/** Marker delimiting the part of a list page that holds the listing links. */
	private static final String LIST_MARKER = "list-info-title";

	/** Number of leading characters occupied by a field label once filtered (labels are 4 Han characters). */
	private static final int LABEL_LENGTH = 4;

	/** Maximum number of characters kept from a listing description. */
	private static final int MAX_DESCRIPTION_LENGTH = 50;

	/** Han characters plus selected punctuation, digits, dot and percent sign. */
	private static final Pattern TEXT_CHARS = Pattern.compile("[\u4E00-\u9FA5,。!、\\d\\.%]");

	/** Han characters only. */
	private static final Pattern HAN_CHARS = Pattern.compile("[\u4E00-\u9FA5]");

	/** Han characters plus selected punctuation; used for titles. */
	private static final Pattern TITLE_CHARS = Pattern.compile("[\u4E00-\u9FA5,。!、]");

	/** A digit followed by any run of digits and dots. */
	private static final Pattern NUMBER_RUN = Pattern.compile("\\d[\\d\\.]*");

	/*
	 * NOTE(review): the brackets make these character classes, not literal text, so
	 * this matches ONE of "/fang6", then digits, then ONE of "x.htm" - in practice
	 * hrefs shaped like "/123456x". Callers rely on that shape (it is appended to
	 * DETAIL_BASE_URL), so the expression is deliberately kept unchanged; confirm
	 * against the live site before tightening it.
	 */
	private static final Pattern LINK_PATTERN = Pattern.compile("[/fang6/][0-9]*[x.htm]");

	// ------------------------------------------------------------------
	// Transfer listings (Piliang)
	// ------------------------------------------------------------------

	/**
	 * Collects the listing links found on one page of the transfer list.
	 *
	 * @param pageurl  list URL prefix; the page number and a trailing slash are appended
	 * @param i        page number
	 * @param cityflag not used by this method
	 * @return map whose keys and values are both the matched href; empty when the
	 *         page cannot be fetched or parsed (the failure is printed to stderr)
	 */
	public Map<String,String> getUsePageLink(String pageurl,int i,int cityflag){
		return collectListingLinks(pageurl, i);
	}

	/**
	 * Downloads one transfer-listing detail page and converts it to a {@link Piliang}.
	 *
	 * @param hrefs href of the listing, appended to {@code http://bj.ganji.com/fang6}
	 * @return the populated, not yet persisted, entity
	 * @throws StringIndexOutOfBoundsException if the page could not be fetched, an
	 *         expected marker is missing, or an extracted field is shorter than its label
	 * @throws NumberFormatException if the extracted floor area or rent is not numeric
	 */
	public Piliang getShopInfoFromWeb(String hrefs) {
		String page = fetchDetailPage(hrefs);

		String title = keep(TITLE_CHARS, between(page, "title-name", 0, "title-info clearfix"));
		String area = keep(TEXT_CHARS, window(page, "所在区域:", 50, 50));
		String address = keep(TEXT_CHARS, window(page, "商铺地址:", 0, 180));
		// The four characters following the four-character label.
		String businessType = keep(TEXT_CHARS, between(page, "适合经营", 0, " 信息编号:"))
				.substring(LABEL_LENGTH, LABEL_LENGTH + 4);
		String floorArea = keep(NUMBER_RUN, window(page, "商铺面积:", 0, 50));
		String rent = keep(NUMBER_RUN, window(page, "租金价格:", 0, 50));
		String contact = keep(HAN_CHARS, window(page, "在线联系:", 0, 150));
		String phone = keep(NUMBER_RUN, window(page, "联系方式:", 0, 150));
		String description = keep(TEXT_CHARS, between(page, "房源描述:", 0, "联系我时"));

		// Wrap the scraped values in an entity and return it.
		Piliang dp = new Piliang();
		dp.setDpxxBti(title);                                           // title
		dp.setDpxxFbusjian(new Timestamp(System.currentTimeMillis()));   // publish time
		dp.setDpxxHtaiztai(2);                                          // back-office status
		dp.setDpxxJtidzhi(firstHalfAfterLabel(address));                // address
		dp.setDpxxLxing("门面/底商");                                     // shop type
		dp.setDpxxMji(Integer.parseInt(floorArea));                     // floor area
		dp.setDpxxMxu(truncate(description.substring(LABEL_LENGTH), MAX_DESCRIPTION_LENGTH));
		applyBusinessType(dp, businessType);
		dp.setDpxxQtaiztai(1);                                          // front-end status
		dp.setDpxxQyu(resolveRegion(area));                             // district
		dp.setDpxxSfouzdin(2);                                          // pinned flag; 2 = not pinned
		dp.setDpxxXxilyuan(6);                                          // information source; 6 = scraped
		dp.setDpxxZjin(Float.parseFloat(rent));                         // rent
		dp.setDpxxZjinBiaozhi(1);                                       // rent flag
		dp.setUserName(contact.substring(LABEL_LENGTH));                // contact name
		dp.setTel(phone);                                               // contact number
		dp.setDpxxYhuId(-1);                                            // poster id; -1 is the placeholder used for scraped records
		return dp;
	}

	/**
	 * Persists one transfer listing.
	 *
	 * @param dpxx entity to save
	 * @return {@code true} when the save call succeeded, {@code false} when it threw
	 *         (the failure is printed to stderr)
	 */
	public boolean intoDatabase(Piliang dpxx) {
		try {
			getSession().save(dpxx);
			return true;
		} catch (Exception e) {
			e.printStackTrace();
			return false;
		}
	}

	/**
	 * Lists the phone numbers that occur more than once among the transfer
	 * listings of one city.
	 *
	 * @param cityflag value matched against the {@code city_flag} column
	 * @return the duplicated {@code tel} values
	 */
	@SuppressWarnings("unchecked")
	public List<Object> getSameDpId(int cityflag) {
		return getSession()
				.createSQLQuery("select tel from piliang where city_flag = ? group by tel having count(tel) > 1")
				.setInteger(0, cityflag)
				.list();
	}

	/**
	 * Deletes every transfer listing whose phone number equals {@code tel}.
	 *
	 * @param tel phone number to purge; bound as a query parameter
	 * @return {@code true} when the statement ran, {@code false} when it threw
	 *         (the failure is printed to stderr)
	 */
	public boolean paichongdp(String tel) {
		try {
			getSession()
					.createSQLQuery("delete from piliang where tel = ?")
					.setString(0, tel)
					.executeUpdate();
		} catch (Exception e) {
			e.printStackTrace();
			return false;
		}
		return true;
	}

	// ------------------------------------------------------------------
	// Wanted-to-rent listings (Qzpiliang)
	// ------------------------------------------------------------------

	/**
	 * Lists the phone numbers that occur more than once among the wanted-to-rent
	 * listings of one city.
	 *
	 * @param cityflag value matched against the {@code city_flag} column
	 * @return the duplicated {@code tel} values
	 */
	@SuppressWarnings("unchecked")
	public List<Object> getSameDpIdQZ(int cityflag) {
		return getSession()
				.createSQLQuery("select tel from qzpiliang where city_flag = ? group by tel having count(tel) > 1")
				.setInteger(0, cityflag)
				.list();
	}

	/**
	 * Downloads one wanted-to-rent detail page and converts it to a {@link Qzpiliang}.
	 *
	 * @param hrefs href of the listing, appended to {@code http://bj.ganji.com/fang6}
	 * @return the populated, not yet persisted, entity
	 * @throws StringIndexOutOfBoundsException if the page could not be fetched, an
	 *         expected marker is missing, or an extracted field is shorter than its label
	 * @throws NumberFormatException if the extracted floor area is not an integer
	 */
	public Qzpiliang getShopInfoFromWebQZ(String hrefs) {
		String page = fetchDetailPage(hrefs);

		String title = keep(TITLE_CHARS, between(page, "title-name", 0, "title-info clearfix"));
		String area = keep(HAN_CHARS, between(page, "期望区域:", 50, "期望地址:"));
		String address = keep(TEXT_CHARS, between(page, "期望地址:", 0, "在线联系:"));
		String floorArea = keep(NUMBER_RUN, window(page, "期望面积:", 0, 50));
		String contact = keep(HAN_CHARS, window(page, "在线联系:", 0, 150));
		String phone = keep(NUMBER_RUN, window(page, "联系方式:", 0, 150));
		String description = keep(TEXT_CHARS, between(page, "房源描述:", 0, "联系我时"));

		int minArea = Integer.parseInt(floorArea);

		// Wrap the scraped values in an entity and return it.
		Qzpiliang dp = new Qzpiliang();
		dp.setQzdpxxBti(title);                                           // title
		dp.setQzdpxxFbusjian(new Timestamp(System.currentTimeMillis()));   // publish time
		dp.setQzdpxxHtaiztai(2);                                          // back-office status
		dp.setQzdpxxQtaiztai(1);                                          // front-end status
		dp.setQzdpxxXxidizhi(firstHalfAfterLabel(address));               // desired address
		dp.setQzdpxxZxiaomji(minArea);                                    // minimum area
		dp.setQzdpxxZdamji(minArea + 500);                                // maximum area: minimum plus 500
		// Descriptions of 50 characters or fewer are stored whole.
		dp.setQzdpxxXxiyqiu(truncate(description.substring(LABEL_LENGTH), MAX_DESCRIPTION_LENGTH));
		applyWantedBusinessType(dp, title);
		dp.setQzdpxxQwangqyu(resolveRegion(area));                        // desired district
		dp.setQzdpxxSfouzdin(2);                                          // pinned flag; 2 = not pinned
		dp.setQzdpxxXxilyuan(6);                                          // information source; 6 = scraped
		dp.setUserName(contact.substring(LABEL_LENGTH));                  // contact name
		dp.setTel(phone);                                                 // contact number
		dp.setQzdpxxYhuId(-1);                                            // poster id; -1 is the placeholder used for scraped records
		return dp;
	}

	/**
	 * Collects the listing links found on one page of the wanted-to-rent list.
	 *
	 * @param qzpageurl list URL prefix; the page number and a trailing slash are appended
	 * @param i         page number
	 * @param cityflag  not used by this method
	 * @return map whose keys and values are both the matched href; empty when the
	 *         page cannot be fetched or parsed (the failure is printed to stderr)
	 */
	public   Map<String, String> getUsePageLinkQZ(String qzpageurl, int i,int cityflag) {
		return collectListingLinks(qzpageurl, i);
	}

	/**
	 * Persists one wanted-to-rent listing.
	 *
	 * @param dpxx entity to save
	 * @return {@code true} when the save call succeeded, {@code false} when it threw
	 *         (the failure is printed to stderr)
	 */
	public boolean intoDatabaseQZ(Qzpiliang dpxx) {
		try {
			getSession().save(dpxx);
			return true;
		} catch (Exception e) {
			e.printStackTrace();
			return false;
		}
	}

	/**
	 * Deletes every wanted-to-rent listing whose phone number equals {@code tel}.
	 *
	 * @param tel phone number to purge; bound as a query parameter
	 * @return {@code true} when the statement ran, {@code false} when it threw
	 *         (the failure is printed to stderr)
	 */
	public boolean paichongdpQZ(String tel) {
		try {
			getSession()
					.createSQLQuery("delete from qzpiliang where tel = ?")
					.setString(0, tel)
					.executeUpdate();
		} catch (Exception e) {
			e.printStackTrace();
			return false;
		}
		return true;
	}

	// ------------------------------------------------------------------
	// Fetching
	// ------------------------------------------------------------------

	/**
	 * Downloads a page and returns its lines concatenated without separators.
	 * The reader and the connection are always released.
	 */
	private static String fetchPage(String address) throws IOException {
		HttpURLConnection connection = (HttpURLConnection) new URL(address).openConnection();
		BufferedReader reader = null;
		try {
			reader = new BufferedReader(new InputStreamReader(connection.getInputStream(), PAGE_CHARSET));
			StringBuilder page = new StringBuilder();
			String line;
			while ((line = reader.readLine()) != null) {
				page.append(line);
			}
			return page.toString();
		} finally {
			if (reader != null) {
				try {
					reader.close();
				} catch (IOException ignored) {
					// A failure while closing must not mask the outcome of the read.
				}
			}
			connection.disconnect();
		}
	}

	/**
	 * Downloads a listing detail page; on failure prints the error and returns an
	 * empty string, which makes the subsequent field extraction fail.
	 */
	private static String fetchDetailPage(String hrefs) {
		try {
			return fetchPage(DETAIL_BASE_URL + hrefs);
		} catch (Exception e) {
			e.printStackTrace();
			return "";
		}
	}

	/** Downloads one list page and gathers the hrefs matched between the first and last list marker. */
	private static Map<String,String> collectListingLinks(String listUrl, int pageNumber) {
		Map<String,String> links = new HashMap<String,String>();
		try {
			String page = fetchPage(listUrl + pageNumber + "/");
			String listing = page.substring(page.indexOf(LIST_MARKER), page.lastIndexOf(LIST_MARKER));
			Matcher matcher = LINK_PATTERN.matcher(listing);
			while (matcher.find()) {
				String link = matcher.group();
				// Keep only matches that contain a slash and are longer than five characters.
				if (link.indexOf("/") != -1 && link.length() > 5) {
					links.put(link, link);
				}
			}
		} catch (Exception e) {
			e.printStackTrace();
		}
		return links;
	}

	// ------------------------------------------------------------------
	// Text extraction
	// ------------------------------------------------------------------

	/** Returns the position of {@code marker}, failing loudly instead of yielding -1. */
	private static int requireIndex(String content, String marker) {
		int position = content.indexOf(marker);
		if (position < 0) {
			throw new StringIndexOutOfBoundsException("marker not found in page: " + marker);
		}
		return position;
	}

	/** Cuts the text from {@code skip} characters after the start of {@code startMarker} up to {@code endMarker}. */
	private static String between(String content, String startMarker, int skip, String endMarker) {
		int from = requireIndex(content, startMarker) + skip;
		int to = requireIndex(content, endMarker);
		return content.substring(from, to);
	}

	/**
	 * Cuts {@code length} characters starting {@code skip} characters after the
	 * start of {@code marker}, clamped to the end of the content.
	 */
	private static String window(String content, String marker, int skip, int length) {
		int start = requireIndex(content, marker) + skip;
		int from = Math.min(content.length(), start);
		int to = Math.min(content.length(), start + length);
		return content.substring(from, to);
	}

	/** Concatenates every match of {@code allowed} in {@code raw}, dropping everything else. */
	private static String keep(Pattern allowed, String raw) {
		StringBuilder kept = new StringBuilder();
		Matcher matcher = allowed.matcher(raw);
		while (matcher.find()) {
			kept.append(matcher.group());
		}
		return kept.toString();
	}

	/** Drops the leading label and returns the first half of what remains. */
	private static String firstHalfAfterLabel(String labelled) {
		String body = labelled.substring(LABEL_LENGTH);
		return body.substring(0, body.length() / 2);
	}

	/** Returns {@code text} cut to at most {@code max} characters. */
	private static String truncate(String text, int max) {
		return text.length() > max ? text.substring(0, max) : text;
	}

	/** Returns true when {@code text} contains at least one of the keywords. */
	private static boolean containsAny(String text, String... keywords) {
		for (String keyword : keywords) {
			if (text.contains(keyword)) {
				return true;
			}
		}
		return false;
	}

	// ------------------------------------------------------------------
	// Classification
	// ------------------------------------------------------------------

	/**
	 * Maps the scraped district text to the stored district name: two known
	 * three-character districts are restored from their two-character prefix,
	 * "北京" and "燕郊" become "其它", anything else is cut to two characters.
	 */
	private static String resolveRegion(String area) {
		String prefix = area.substring(0, 2);
		if (prefix.equals("门头")) {
			return "门头沟";
		}
		if (prefix.equals("石景")) {
			return "石景山";
		}
		if (prefix.equals("北京") || area.equals("燕郊")) {
			return "其它";
		}
		return prefix;
	}

	/**
	 * Sets parent and child business category from the four-character business
	 * type scraped from a transfer listing; unknown types leave both unset.
	 */
	private static void applyBusinessType(Piliang dp, String type) {
		if (type.equals("餐饮美食")) {
			dp.setDpxxPyetai("餐饮转让");
			dp.setDpxxZyetai("饭店酒楼");
		} else if (type.startsWith("酒吧")) {
			dp.setDpxxPyetai("休闲娱乐");
			dp.setDpxxZyetai("KTV酒吧");
		} else if (type.equals("洗浴健身")) {
			dp.setDpxxPyetai("休闲娱乐");
			dp.setDpxxZyetai("洗浴休闲养生");
		} else if (type.equals("美容美发")) {
			dp.setDpxxPyetai("美发美体");
			dp.setDpxxZyetai("美发店");
		} else if (type.equals("服装服饰")) {
			dp.setDpxxPyetai("服饰鞋包");
			dp.setDpxxZyetai("服装店");
		} else if (type.equals("休闲娱乐")) {
			dp.setDpxxPyetai("休闲娱乐");
			dp.setDpxxZyetai("其它");
		} else if (type.startsWith("干洗店")) {
			dp.setDpxxPyetai("生活服务");
			dp.setDpxxZyetai("洗衣美鞋店");
		} else if (type.equals("汽车美容")) {
			dp.setDpxxPyetai("汽车服务");
			dp.setDpxxZyetai("车饰美容店");
		} else if (type.equals("该商铺出") || type.startsWith("其他")) {
			// The type is always four characters long, so "其他" has to be a prefix test.
			dp.setDpxxPyetai("空铺专柜");
			dp.setDpxxZyetai("空铺出租");
		}
	}

	/**
	 * Sets parent and child business category of a wanted-to-rent listing from
	 * keywords in its title; the first matching group wins and titles without any
	 * keyword fall back to the vacant-shop category.
	 */
	private static void applyWantedBusinessType(Qzpiliang dp, String title) {
		if (containsAny(title, "餐饮", "美食", "面", "汤", "饭", "快餐", "小吃")) {
			dp.setQzdpxxPyetai("餐饮转让");
			dp.setQzdpxxZyetai("饭店酒楼");
		} else if (containsAny(title, "游泳")) {
			dp.setQzdpxxPyetai("运动健身");
			dp.setQzdpxxZyetai("游泳馆");
		} else if (containsAny(title, "足疗")) {
			dp.setQzdpxxPyetai("医疗保健");
			dp.setQzdpxxZyetai("足疗店");
		} else if (containsAny(title, "汽车")) {
			dp.setQzdpxxPyetai("汽车服务");
			dp.setQzdpxxZyetai("车饰美容店");
		} else if (containsAny(title, "服饰", "服装")) {
			dp.setQzdpxxPyetai("服饰鞋包");
			dp.setQzdpxxZyetai("服装店");
		} else if (containsAny(title, "宾馆", "旅馆", "公寓", "旅店")) {
			dp.setQzdpxxPyetai("宾馆酒店");
			dp.setQzdpxxZyetai("宾馆招待所");
		} else {
			dp.setQzdpxxPyetai("空铺专柜");
			dp.setQzdpxxZyetai("空铺出租");
		}
	}

}
最近下载更多
朱俪的邮件及存储  LV8 3月11日
sunlea  LV20 2022年9月8日
moomin709  LV24 2022年7月25日
1798672867  LV21 2021年8月6日
benbosn  LV15 2020年9月11日
aaa最代码  LV14 2020年7月30日
921117  LV4 2020年4月26日
Marcos  LV1 2020年1月31日
tyyeng  LV18 2020年1月31日
2392975497  LV1 2019年10月17日
最近浏览更多
朱俪的邮件及存储  LV8 3月11日
微信网友_5992582549164032  LV6 2022年12月12日
xiaoxiexie  LV13 2022年11月13日
qwertyuiopzxc  LV1 2022年10月14日
sunlea  LV20 2022年9月8日
moomin709  LV24 2022年7月25日
流眼泪  LV1 2022年3月24日
小熊专属  LV3 2021年12月15日
13798956075  LV1 2021年10月8日
1798672867  LV21 2021年8月6日
顶部 客服 微信二维码 底部
>扫描二维码关注最代码为好友扫描二维码关注最代码为好友