package data.file;

import java.io.File;
import java.io.IOException;
import java.util.Collections;
import java.util.Comparator;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.regex.Pattern;

import org.apache.commons.io.FileUtils;

import data.domain.Paper;
import data.domain.Reference;

/**
 * Reads a text file of paper records and writes summary reports next to it.
 *
 * <p>The input is read line by line. A line whose trimmed text starts with one of the
 * configured prefixes (see the setters) supplies the corresponding field of the current
 * paper; a line starting with the {@code reference} prefix opens the paper's reference
 * list; a line starting with the {@code separator} closes the current paper. All
 * prefixes, the {@code suffix} and the {@code encoding} must be configured through the
 * setters before any {@code analyze...} method is called.
 *
 * <p>Each {@code analyze...} method writes one report file whose name is derived from the
 * input path by replacing the configured suffix with a report-specific extension.
 */
public class Statistics {

    /** Matches a non-empty run of ASCII digits; used to validate year strings. */
    private static final Pattern DIGITS = Pattern.compile("[0-9]+");

    /** Index of the id prefix inside the array built by {@link #parsePapers(List)}. */
    private static final int ID_FIELD = 0;

    /** Suffix of the input file name that is replaced by the report extension. */
    private String suffix;
    /** Character encoding name used for reading the input and writing reports. */
    private String encoding;
    /** Prefix of the line that terminates one paper record. */
    private String separator;

    // Line prefixes that introduce the individual paper fields in the input file.
    private String id;
    private String sourceName;
    private String englishName;
    private String sourceAuthor;
    private String type;
    private String fund;
    private String journal;
    private String firstOrg;
    private String orgName;
    private String category;
    private String firstAuthor;
    private String isbn;
    private String yearVolume;
    private String keywords;
    private String fundType;
    /** Prefix of the line that opens the reference list of a paper. */
    private String reference;

    /** Papers parsed by the most recent {@link #analyzeReference(String)} call. */
    private List<Paper> papers;

    /**
     * Parses the input file into papers and writes the {@code .reference.text} report:
     * the list of distinct reference titles, followed by the titles cited more than once
     * together with their citation counts (most cited first).
     *
     * @param fileAbsPath absolute path of the input file
     * @return {@code true} if the file was read and the report written; {@code false} if
     *         the file does not exist, cannot be read, or the report cannot be written
     */
    public boolean analyzeReference(String fileAbsPath) {
        papers = new LinkedList<Paper>();
        File file = new File(fileAbsPath);
        if (!file.exists()) {
            System.err.println(fileAbsPath + " is not exist, please check the file first!");
            return false;
        }
        List<String> fileContents;
        try {
            fileContents = FileUtils.readLines(file, encoding);
        } catch (IOException e) {
            e.printStackTrace();
            System.err.println("Read " + fileAbsPath + " failed, please check the file first!");
            return false;
        }
        papers = parsePapers(fileContents);

        // Insertion-ordered so the report lists titles in order of first appearance.
        Map<String, Integer> occurrences = countReferenceTitles(papers);
        Map<String, Integer> duplicates = new LinkedHashMap<String, Integer>();
        for (Map.Entry<String, Integer> entry : occurrences.entrySet()) {
            if (entry.getValue().intValue() > 1) {
                duplicates.put(entry.getKey(), entry.getValue());
            }
        }

        List<String> report = new LinkedList<String>();
        report.add("所有文献中共有" + occurrences.size() + "个不重复的参考文献,名单如下:");
        int no = 1;
        for (String title : occurrences.keySet()) {
            report.add(no + ". " + title);
            no += 1;
        }

        // sortByValue is used (as in analyzeFirstOrganization) rather than a TreeMap
        // ordered by a value comparator: in such a TreeMap both get(key) and key
        // uniqueness are decided by the comparator instead of by the key itself.
        Map<String, Integer> sortedDuplicates = sortByValue(duplicates);
        report.add(System.getProperty("line.separator"));
        report.add("所有文献中共有" + sortedDuplicates.size() + "个重复的参考文献,名单如下:");
        no = 1;
        for (Map.Entry<String, Integer> entry : sortedDuplicates.entrySet()) {
            report.add(no + ". " + entry.getKey() + "\t" + entry.getValue());
            no += 1;
        }
        return writeReport(fileAbsPath, ".reference.text", report);
    }

    /**
     * Writes the {@code .fund.text} report listing the source name of every paper that
     * has a non-blank fund field. If no papers are loaded yet,
     * {@link #analyzeReference(String)} is run first.
     *
     * @param fileAbsPath absolute path of the input file
     * @return {@code true} on success; {@code false} if loading the papers or writing
     *         the report failed
     */
    public boolean analyzeFund(String fileAbsPath) {
        if ((papers == null || papers.isEmpty()) && !analyzeReference(fileAbsPath)) {
            return false;
        }
        List<String> report = new LinkedList<String>();
        int no = 1;
        for (Paper paper : papers) {
            if (paper == null || isBlank(paper.getFund())) {
                continue;
            }
            report.add(no + ". " + paper.getSourceName());
            no += 1;
        }
        // The header is inserted last because it contains the number of listed papers.
        report.add(0, report.size() + "个文章有基金项目,这些文章的来源篇名如下:");
        return writeReport(fileAbsPath, ".fund.text", report);
    }

    /**
     * Writes the {@code .1stOrganization.text} report: first organizations ordered by
     * the number of papers they published, highest first. If no papers are loaded yet,
     * {@link #analyzeFund(String)} is run first.
     *
     * @param fileAbsPath absolute path of the input file
     * @return {@code true} on success; {@code false} if loading the papers or writing
     *         the report failed
     */
    public boolean analyzeFirstOrganization(String fileAbsPath) {
        if ((papers == null || papers.isEmpty()) && !analyzeFund(fileAbsPath)) {
            return false;
        }
        Map<String, Integer> paperCounts = new LinkedHashMap<String, Integer>();
        for (Paper paper : papers) {
            // A paper without a first-organization line has a null field; skip it.
            if (paper == null || isBlank(paper.getFirstOrg())) {
                continue;
            }
            increment(paperCounts, paper.getFirstOrg());
        }

        List<String> report = new LinkedList<String>();
        report.add("按第一机构发文量进行排序如下:");
        int no = 1;
        for (Map.Entry<String, Integer> entry : sortByValue(paperCounts).entrySet()) {
            report.add(no + ". " + entry.getKey() + "\t" + entry.getValue());
            no += 1;
        }
        return writeReport(fileAbsPath, ".1stOrganization.text", report);
    }

    /**
     * Writes the {@code .yearDiffBetweenJournalAndReference.text} report: for every paper
     * with references and a year/volume field containing a comma, the difference between
     * the paper's year (first four characters of the trimmed year/volume field) and the
     * year of each of its references. Two-digit reference years are prefixed with
     * {@code "20"}. Years that cannot be parsed are reported on {@code System.err} and
     * skipped. If no papers are loaded yet, {@link #analyzeFirstOrganization(String)} is
     * run first.
     *
     * @param fileAbsPath absolute path of the input file
     * @return {@code true} on success; {@code false} if loading the papers or writing
     *         the report failed
     */
    public boolean analyzeYearDiffBetweenJournalAndReferences(String fileAbsPath) {
        if ((papers == null || papers.isEmpty()) && !analyzeFirstOrganization(fileAbsPath)) {
            return false;
        }
        List<String> report = new LinkedList<String>();
        report.add("发文期刊与参考文献期刊的年代差如下:");
        for (Paper paper : papers) {
            if (paper == null || paper.getReference() == null || paper.getReference().isEmpty()) {
                continue;
            }
            if (isBlank(paper.getYearVolume()) || !paper.getYearVolume().contains(",")) {
                continue;
            }
            String volume = paper.getYearVolume().trim();
            Integer journalYear = volume.length() >= 4 ? parseDigits(volume.substring(0, 4)) : null;
            if (journalYear == null) {
                System.err.println("Error in year format(" + paper.getYearVolume() + ") of paper "
                        + paper.getId());
                continue;
            }
            report.add(id + paper.getId());
            for (Reference ref : paper.getReference()) {
                if (ref == null || isBlank(ref.getYear())) {
                    continue;
                }
                Integer referenceYear = parseReferenceYear(ref.getYear());
                if (referenceYear == null) {
                    System.err.println("Error in year format(" + ref.getYear() + ") of reference for "
                            + ref.getTitle());
                    continue;
                }
                report.add(paper.getSourceName() + " - " + ref.getTitle() + " = " + journalYear + " - "
                        + referenceYear + " = " + (journalYear.intValue() - referenceYear.intValue()));
            }
            report.add(System.getProperty("line.separator"));
        }
        return writeReport(fileAbsPath, ".yearDiffBetweenJournalAndReference.text", report);
    }

    /**
     * Returns a copy of {@code map} whose iteration order is by descending value. Entries
     * with equal values keep the relative order they had in {@code map}.
     *
     * @param map the map to sort; values must be mutually {@link Comparable}
     * @return a new insertion-ordered map, empty if {@code map} is {@code null}
     * @throws ClassCastException if the values are not mutually comparable
     */
    public static <K, V> Map<K, V> sortByValue(Map<K, V> map) {
        Map<K, V> result = new LinkedHashMap<K, V>();
        if (map == null) {
            return result;
        }
        List<Map.Entry<K, V>> entries = new LinkedList<Map.Entry<K, V>>(map.entrySet());
        Collections.sort(entries, new Comparator<Map.Entry<K, V>>() {
            // The value type is left unbounded so existing callers keep compiling; values
            // that are not Comparable fail here with ClassCastException.
            @SuppressWarnings("unchecked")
            public int compare(Map.Entry<K, V> left, Map.Entry<K, V> right) {
                return ((Comparable<Object>) right.getValue()).compareTo(left.getValue());
            }
        });
        for (Map.Entry<K, V> entry : entries) {
            result.put(entry.getKey(), entry.getValue());
        }
        return result;
    }

    /** Turns the lines of the input file into papers; see the class comment for the format. */
    private List<Paper> parsePapers(List<String> lines) {
        // Order matters: the first matching prefix wins.
        String[] prefixes = { id, sourceName, englishName, sourceAuthor, type, fund, journal, firstOrg,
                orgName, category, firstAuthor, isbn, yearVolume, keywords, fundType };
        String[] values = new String[prefixes.length];
        List<Paper> parsed = new LinkedList<Paper>();
        List<Reference> references = new LinkedList<Reference>();
        boolean inReferences = false;

        for (String line : lines) {
            String trimmed = line.trim();
            if (trimmed.length() == 0) {
                continue;
            }
            int field = matchingPrefix(trimmed, prefixes);
            if (field >= 0) {
                values[field] = trimmed.substring(prefixes[field].length());
                if (field == ID_FIELD) {
                    // An id line starts a new record, which also ends any reference list.
                    inReferences = false;
                }
                continue;
            }
            if (hasPrefix(trimmed, reference)) {
                inReferences = true;
                continue;
            }
            if (hasPrefix(trimmed, separator)) {
                parsed.add(new Paper(values[0], values[1], values[2], values[3], values[4], values[5],
                        values[6], values[7], values[8], values[9], values[10], values[11], values[12],
                        values[13], values[14], references));
                // Start the next record with a fresh list and fresh fields: reusing the
                // list would share it between papers, and reusing the fields would leak
                // values into a record that lacks the corresponding line.
                references = new LinkedList<Reference>();
                values = new String[prefixes.length];
                inReferences = false;
                continue;
            }
            if (inReferences) {
                Reference parsedReference = parseReference(line);
                if (parsedReference != null) {
                    references.add(parsedReference);
                }
            }
        }
        return parsed;
    }

    /**
     * Parses one reference line, which is split on {@code '.'}. Four parts are read as a
     * reference whose last part may hold {@code city:publisher}; six parts are passed on
     * without city and publisher. Any other number of parts yields {@code null}.
     */
    private static Reference parseReference(String line) {
        String[] parts = line.split("\\.");
        String city = null;
        String publisher = null;
        if (parts.length == 4) {
            if (parts[3].contains(":")) {
                String[] cityAndPublisher = parts[3].split(":");
                if (cityAndPublisher.length == 1) {
                    city = cityAndPublisher[0];
                } else if (cityAndPublisher.length == 2) {
                    city = cityAndPublisher[0];
                    publisher = cityAndPublisher[1];
                }
            }
            return new Reference(parts[0], parts[1], parts[2], city, publisher, null);
        }
        if (parts.length == 6) {
            return new Reference(parts[0], parts[1], parts[2], city, publisher, parts[3], parts[4],
                    parts[5]);
        }
        return null;
    }

    /** Counts how often each reference title occurs over all papers, in first-seen order. */
    private static Map<String, Integer> countReferenceTitles(List<Paper> source) {
        Map<String, Integer> occurrences = new LinkedHashMap<String, Integer>();
        for (Paper paper : source) {
            if (paper == null || paper.getReference() == null) {
                continue;
            }
            for (Reference ref : paper.getReference()) {
                if (ref != null) {
                    increment(occurrences, ref.getTitle());
                }
            }
        }
        return occurrences;
    }

    /** Adds one to the counter stored under {@code key}, starting at one. */
    private static void increment(Map<String, Integer> counters, String key) {
        Integer current = counters.get(key);
        counters.put(key, Integer.valueOf(current == null ? 1 : current.intValue() + 1));
    }

    /**
     * Parses a reference year. Only the text before the first {@code '.'} is used, and a
     * two-digit year is prefixed with {@code "20"}.
     *
     * @return the year, or {@code null} if the text is not a plain number
     */
    private static Integer parseReferenceYear(String rawYear) {
        String year = rawYear.trim();
        int dot = year.indexOf('.');
        if (dot >= 0) {
            year = year.substring(0, dot).trim();
        }
        if (year.length() == 2) {
            year = "20" + year;
        }
        return parseDigits(year);
    }

    /** Returns {@code text} as an integer, or {@code null} if it is not a run of digits. */
    private static Integer parseDigits(String text) {
        if (!DIGITS.matcher(text).matches()) {
            return null;
        }
        try {
            return Integer.valueOf(text);
        } catch (NumberFormatException e) {
            // All digits but too large for an int.
            return null;
        }
    }

    /** Returns the index of the first non-null prefix that {@code text} starts with, or -1. */
    private static int matchingPrefix(String text, String[] prefixes) {
        for (int i = 0; i < prefixes.length; i++) {
            if (hasPrefix(text, prefixes[i])) {
                return i;
            }
        }
        return -1;
    }

    /** Null-safe {@code startsWith}: an unconfigured (null) prefix never matches. */
    private static boolean hasPrefix(String text, String prefix) {
        return prefix != null && text.startsWith(prefix);
    }

    /** Returns whether {@code text} is null or contains only whitespace. */
    private static boolean isBlank(String text) {
        return text == null || text.trim().length() == 0;
    }

    /**
     * Writes {@code lines} to the report file derived from the input path.
     *
     * @return {@code true} if the file was written, {@code false} on an I/O error
     */
    private boolean writeReport(String fileAbsPath, String reportExtension, List<String> lines) {
        try {
            FileUtils.writeLines(new File(reportPath(fileAbsPath, reportExtension)), encoding, lines);
            return true;
        } catch (IOException e) {
            e.printStackTrace();
            System.err.println("Can not write statistics results into file, please check if there has enough disk space.");
            return false;
        }
    }

    /**
     * Builds the report path by cutting the input path at the last occurrence of the
     * configured suffix and appending {@code reportExtension}. If the suffix is not
     * configured or does not occur, the extension is appended to the whole path.
     */
    private String reportPath(String fileAbsPath, String reportExtension) {
        int cut = suffix == null ? -1 : fileAbsPath.lastIndexOf(suffix);
        String base = cut < 0 ? fileAbsPath : fileAbsPath.substring(0, cut);
        return base + reportExtension;
    }

    public String getSuffix() {
        return suffix;
    }

    public void setSuffix(String suffix) {
        this.suffix = suffix;
    }

    public String getEncoding() {
        return encoding;
    }

    public void setEncoding(String encoding) {
        this.encoding = encoding;
    }

    public String getSeparator() {
        return separator;
    }

    public void setSeparator(String separator) {
        this.separator = separator;
    }

    public String getId() {
        return id;
    }

    public void setId(String id) {
        this.id = id;
    }

    public String getSourceName() {
        return sourceName;
    }

    public void setSourceName(String sourceName) {
        this.sourceName = sourceName;
    }

    public String getEnglishName() {
        return englishName;
    }

    public void setEnglishName(String englishName) {
        this.englishName = englishName;
    }

    public String getSourceAuthor() {
        return sourceAuthor;
    }

    public void setSourceAuthor(String sourceAuthor) {
        this.sourceAuthor = sourceAuthor;
    }

    public String getType() {
        return type;
    }

    public void setType(String type) {
        this.type = type;
    }

    public String getFund() {
        return fund;
    }

    public void setFund(String fund) {
        this.fund = fund;
    }

    public String getJournal() {
        return journal;
    }

    public void setJournal(String journal) {
        this.journal = journal;
    }

    public String getFirstOrg() {
        return firstOrg;
    }

    public void setFirstOrg(String firstOrg) {
        this.firstOrg = firstOrg;
    }

    public String getOrgName() {
        return orgName;
    }

    public void setOrgName(String orgName) {
        this.orgName = orgName;
    }

    public String getCategory() {
        return category;
    }

    public void setCategory(String category) {
        this.category = category;
    }

    public String getFirstAuthor() {
        return firstAuthor;
    }

    public void setFirstAuthor(String firstAuthor) {
        this.firstAuthor = firstAuthor;
    }

    public String getIsbn() {
        return isbn;
    }

    public void setIsbn(String isbn) {
        this.isbn = isbn;
    }

    public String getYearVolume() {
        return yearVolume;
    }

    public void setYearVolume(String yearVolume) {
        this.yearVolume = yearVolume;
    }

    public String getKeywords() {
        return keywords;
    }

    public void setKeywords(String keywords) {
        this.keywords = keywords;
    }

    public String getFundType() {
        return fundType;
    }

    public void setFundType(String fundType) {
        this.fundType = fundType;
    }

    public String getReference() {
        return reference;
    }

    public void setReference(String reference) {
        this.reference = reference;
    }
}
最近浏览更多
qiuyuqiuyuqiuyu
2023年5月25日
暂无贡献等级
Irene777 LV1
2022年2月24日
2898369623 LV1
2021年10月12日
1798672867 LV21
2021年7月18日
哎呀 LV1
2021年5月15日
2018ly
2021年5月13日
暂无贡献等级
FshfshFsh LV2
2021年3月7日
2196316269 LV10
2021年2月24日
litaosb LV5
2020年12月14日
532069753 LV3
2020年5月1日