Google App Engineでの全文検索サーブレットの例
説明は、Search APIの使い方を見てください。
version1.0
web.xml
<servlet>
<servlet-name>SearchTest</servlet-name>
<servlet-class>com.akjava.gae.staticweb.server.SearchTest</servlet-class>
</servlet>
<servlet-mapping>
<servlet-name>SearchTest</servlet-name>
<url-pattern>/admin/search</url-pattern>
</servlet-mapping>
最新版のGuavaライブラリーが必要です。 あと、この例では、FileEntity.classと、PersistenceManagerFactoryを返すPMF.classを使っています。
SearchTest.java
package com.akjava.gae;
import java.io.IOException;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import javax.jdo.PersistenceManager;
import javax.jdo.Query;
import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import com.google.appengine.api.search.Document;
import com.google.appengine.api.search.Field;
import com.google.appengine.api.search.Index;
import com.google.appengine.api.search.IndexSpec;
import com.google.appengine.api.search.PutException;
import com.google.appengine.api.search.QueryOptions;
import com.google.appengine.api.search.Results;
import com.google.appengine.api.search.ScoredDocument;
import com.google.appengine.api.search.SearchServiceFactory;
import com.google.appengine.api.search.StatusCode;
import com.google.common.base.Charsets;
import com.google.common.base.Function;
import com.google.common.base.Joiner;
import com.google.common.base.Predicate;
import com.google.common.base.Predicates;
import com.google.common.base.Stopwatch;
import com.google.common.base.Strings;
import com.google.common.base.Utf8;
import com.google.common.collect.Collections2;
import com.google.common.collect.FluentIterable;
import com.google.common.net.MediaType;
public class SearchTest extends HttpServlet{
/**
 * Serialization version id of this servlet class.
 */
private static final long serialVersionUID = 1L;
/** Name of the Search API index that every mode of this servlet reads from or writes to. */
private static final String HTML_DB = "htmldb";
/**
 * Dispatches a GET request to the handler selected by the {@code mode} request parameter.
 *
 * <p>Recognized modes: {@code update}, {@code addall}, {@code search}, {@code add},
 * {@code clear}, {@code list} and {@code input}. A missing parameter is treated as
 * {@code input}. Any other value is logged and answered with HTTP 400, the same status
 * {@code doAdd} uses for a missing {@code path}.
 *
 * @param req  the incoming request
 * @param resp the response handed to the selected handler
 * @throws ServletException declared by the overridden method
 * @throws IOException if the error response for an unknown mode cannot be sent
 */
@Override
protected void doGet(HttpServletRequest req, HttpServletResponse resp)
        throws ServletException, IOException {
    String mode = req.getParameter("mode");
    if (mode == null) {
        mode = "input";
    }
    if (mode.equals("update")) {
        doUpdate(req, resp);
    } else if (mode.equals("addall")) {
        doAddAll(req, resp);
    } else if (mode.equals("search")) {
        doSearch(req, resp);
    } else if (mode.equals("add")) {
        doAdd(req, resp);
    } else if (mode.equals("clear")) {
        doClearIndexes(req, resp);
    } else if (mode.equals("list")) {
        doListIndexes(req, resp);
    } else if (mode.equals("input")) {
        doMakeInputHtml(req, resp);
    } else {
        System.out.println("invalid parameter:" + mode);
        // Tell the client instead of returning an empty 200 page; the raw value is kept
        // out of the message so it is never echoed back into an error page.
        resp.sendError(HttpServletResponse.SC_BAD_REQUEST, "invalid mode parameter");
    }
}
/**
 * Writes the start page: a search form (hidden {@code mode=search} plus a {@code q} text box)
 * followed by links to the list, update, add-all and clear modes.
 *
 * @param req  unused
 * @param resp receives the HTML page
 */
private void doMakeInputHtml(HttpServletRequest req, HttpServletResponse resp) {
    StringBuilder page = new StringBuilder("<html><body><h1>search</h1><form>");
    page.append(new Tag("input").attr("type", "hidden").attr("name", "mode").attr("value", "search"));
    page.append(new Tag("input").attr("name", "q")).append("<br>");
    page.append(new Tag("input").attr("type", "submit")).append("<br>");

    // href / label pairs, rendered in this order and separated by "|"
    String[][] links = {
        {"?mode=list", "List All"},
        {"?mode=update", "Update"},
        {"?mode=addall", "Add All"},
        {"?mode=clear", "Clear"},
    };
    for (int i = 0; i < links.length; i++) {
        if (i > 0) {
            page.append("|");
        }
        page.append(new Tag("a").attr("href", links[i][0]).text(links[i][1]));
    }
    page.append("</form></body></html>");

    resp.setContentType(MediaType.HTML_UTF_8.toString());
    try {
        resp.getWriter().print(page.toString());
    } catch (IOException e) {
        e.printStackTrace();
    }
}
/**
 * Deletes the documents of the {@code HTML_DB} search index and reports the deleted ids.
 *
 * <p>Ids are fetched and deleted in rounds of up to 200 until a search returns nothing,
 * so indexes larger than one result page are cleared as well. Each round issues a single
 * batched delete call instead of one call per document.
 *
 * @param req  unused
 * @param resp receives "clear count=N" followed by one id per line
 */
private void doClearIndexes(HttpServletRequest req, HttpServletResponse resp) {
    System.out.println("clearIndexes");
    IndexSpec indexSpec = IndexSpec.newBuilder().setName(HTML_DB).build();
    Index index = SearchServiceFactory.getSearchService().getIndex(indexSpec);

    // Only ids are needed for deletion. 200 per round keeps each batched delete within the
    // per-call document limit of the Search API (TODO confirm the limit for the SDK in use).
    QueryOptions options = QueryOptions.newBuilder()
            .setLimit(200).setReturningIdsOnly(true).build();
    com.google.appengine.api.search.Query query = com.google.appengine.api.search.Query.newBuilder()
            .setOptions(options)
            .build("");

    java.util.Set<String> removedIds = new java.util.LinkedHashSet<String>();
    while (true) {
        Results<ScoredDocument> result = index.search(query);
        List<String> batch = new ArrayList<String>();
        for (ScoredDocument doc : result) {
            batch.add(doc.getId());
        }
        if (batch.isEmpty()) {
            break;
        }
        index.delete(batch);
        if (!removedIds.addAll(batch)) {
            // The search returned only ids that were already deleted (stale results);
            // stop instead of looping until the request deadline.
            break;
        }
    }

    resp.setContentType(MediaType.HTML_UTF_8.toString());
    try {
        resp.getWriter().print("clear count=" + removedIds.size() + "<br>");
        for (String id : removedIds) {
            resp.getWriter().print(id + "<br>");
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
/**
 * Lists the ids of up to 300 documents stored in the {@code HTML_DB} search index.
 *
 * @param req  unused
 * @param resp receives "list count=N" followed by one id per line
 */
private void doListIndexes(HttpServletRequest req, HttpServletResponse resp) {
    System.out.println("listIndexes");
    Index htmlIndex = SearchServiceFactory.getSearchService()
            .getIndex(IndexSpec.newBuilder().setName(HTML_DB).build());

    // An empty query string matches every document; ids are all this listing needs.
    QueryOptions idsOnly = QueryOptions.newBuilder()
            .setLimit(300)
            .setReturningIdsOnly(true)
            .build();
    com.google.appengine.api.search.Query matchAll =
            com.google.appengine.api.search.Query.newBuilder().setOptions(idsOnly).build("");
    Results<ScoredDocument> found = htmlIndex.search(matchAll);

    resp.setContentType(MediaType.HTML_UTF_8.toString());
    try {
        java.io.PrintWriter out = resp.getWriter();
        out.print("list count=" + found.getNumberReturned() + "<br>");
        for (ScoredDocument hit : found) {
            out.print(hit.getId() + "<br>");
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
/**
 * Accepts the paths that should be indexed. A path is rejected when it contains
 * ".cache.", ends with "/hosted.html", or its file name (the part after the last '/')
 * starts with an underscore.
 */
public static class SafeHtmlPathPredicate implements Predicate<String>{
    @Override
    public boolean apply(String input) {
        boolean cacheArtifact = input.contains(".cache.");
        if (cacheArtifact || input.endsWith("/hosted.html")) {
            return false;
        }
        String lastSegment = FileNames.asSlash().getFileName(input);
        return !lastSegment.startsWith("_");
    }
}
/**
 * Indexes the single {@code FileEntity} addressed by the {@code path} request parameter.
 *
 * <p>Responses: HTTP 400 when {@code path} is missing; "skipped:" when the entity has no
 * title or no data; HTTP 500 when the index rejects the document; otherwise "added:".
 * The persistence manager is always closed, and the values needed for the document are
 * copied out of the entity before that happens.
 *
 * @param req  supplies the {@code path} parameter
 * @param resp receives the outcome
 */
private void doAdd(HttpServletRequest req, HttpServletResponse resp) {
    String path = req.getParameter("path");
    if (path == null) {
        try {
            resp.sendError(400);
        } catch (IOException e) {
            e.printStackTrace();
        }
        return;
    }

    BenchMarkTool benchmark = new BenchMarkTool(System.out);
    String documentId = null;
    String title = null;
    String text = null;
    boolean indexable = false;

    PersistenceManager manager = PMF.get().getPersistenceManager();
    try {
        benchmark.start();
        FileEntity entity = manager.getObjectById(FileEntity.class, path);
        benchmark.finish("get-entity:");
        if (entity.getTitle() != null && !entity.getTitle().isEmpty() && entity.getData() != null) {
            documentId = entity.getPath();
            title = entity.getTitle();
            text = new String(entity.getData(), Charsets.UTF_8);
            indexable = true;
        }
    } finally {
        // Release the datastore resources on every exit path.
        manager.close();
    }

    resp.setContentType(MediaType.HTML_UTF_8.toString());
    if (!indexable) {
        // Previously this case returned an empty page; say what happened instead.
        try {
            resp.getWriter().write("skipped:" + path);
        } catch (IOException e) {
            e.printStackTrace();
        }
        return;
    }

    benchmark.start();
    Document document = Document.newBuilder().setId(documentId)
            .addField(Field.newBuilder().setName("text").setHTML(text))
            .addField(Field.newBuilder().setName("title").setText(title))
            .build();
    IndexSpec indexSpec = IndexSpec.newBuilder().setName(HTML_DB).build();
    Index index = SearchServiceFactory.getSearchService().getIndex(indexSpec);
    try {
        index.put(document);
    } catch (PutException e) {
        if (StatusCode.TRANSIENT_ERROR.equals(e.getOperationResult().getCode())) {
            System.out.println("TRANSIENT_ERROR");
        }
        e.printStackTrace();
        // The document was not stored, so do not report "added:".
        try {
            resp.sendError(HttpServletResponse.SC_INTERNAL_SERVER_ERROR);
        } catch (IOException ioe) {
            ioe.printStackTrace();
        }
        return;
    }
    benchmark.finish("add-index:");

    try {
        resp.getWriter().write("added:" + path);
    } catch (IOException e) {
        e.printStackTrace();
    }
}
/**
 * Runs the query given in the {@code q} request parameter against the {@code HTML_DB}
 * index and writes one link per hit (href = document id, label = the "title" field).
 *
 * <p>The query string, ids and titles are HTML-escaped before being written, because the
 * query comes straight from the request and all three end up inside markup.
 * Nothing is written when {@code q} is missing.
 *
 * @param req  supplies the {@code q} parameter
 * @param resp receives the result page
 */
private void doSearch(HttpServletRequest req, HttpServletResponse resp) {
    String query = req.getParameter("q");
    if (query == null) {
        return;
    }
    Stopwatch watch = Stopwatch.createStarted();
    Index index = SearchServiceFactory.getSearchService().getIndex(IndexSpec.newBuilder().setName(HTML_DB).build());
    watch.stop();
    long indexTime = watch.elapsed(TimeUnit.MILLISECONDS);
    watch.reset();
    System.out.println("index-time:" + indexTime);

    watch.start();
    Results<ScoredDocument> result = index.search(query);
    StringBuilder ret = new StringBuilder();
    ret.append("search=").append(escapeForHtml(query)).append("<hr/>");
    for (ScoredDocument doc : result.getResults()) {
        String title = doc.getOnlyField("title").getText();
        String id = doc.getId();
        ret.append("<a href='").append(escapeForHtml(id)).append("'>")
                .append(escapeForHtml(title)).append("</a>").append("<br/>");
    }
    watch.stop();
    long t = watch.elapsed(TimeUnit.MILLISECONDS);
    watch.reset();
    System.out.println("search-time(exclude make index):" + t);

    resp.setContentType(MediaType.HTML_UTF_8.toString());
    try {
        resp.getWriter().write("<html><head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF8\"></head><body>");
        resp.getWriter().write(ret.toString());
        resp.getWriter().write("</body></html>");
    } catch (IOException e) {
        e.printStackTrace();
    }
}

/** Escapes {@code value} for use in HTML text or a quoted attribute; null renders as "null". */
private static String escapeForHtml(String value) {
    return com.google.common.html.HtmlEscapers.htmlEscaper().escape(String.valueOf(value));
}
// Status text (HTML fragments) collected while a request is handled.
// NOTE(review): this is an instance field of a servlet, so concurrent requests would
// share it - confirm whether that can happen in this deployment.
String log;
/**
 * Key of the {@code FileEntity} record used as the indexing marker: its mdate holds the
 * time of the last index run and its data holds the list of indexed paths.
 */
public static final String SEARCH_INDEX_NAME="lastSearchIndexed";
/**
 * Indexes the HTML files modified since the last run.
 *
 * <p>The time of the last run is the mdate of the {@code SEARCH_INDEX_NAME} marker entity;
 * when that entity does not exist yet, everything is treated as modified. Files rejected by
 * {@link SafeHtmlPathPredicate} are skipped. When at least one file was indexed, the marker
 * is stored again with the current time and the list of indexed paths.
 *
 * <p>The status text is kept in a local buffer and written to the response, and the
 * persistence manager is closed when the method finishes.
 *
 * @param req  unused
 * @param resp receives the status text, the number of indexed files and their paths
 */
private void doUpdate(HttpServletRequest req, HttpServletResponse resp){
    resp.setContentType(MediaType.HTML_UTF_8.toString());
    StringBuilder status = new StringBuilder();
    PersistenceManager manager = PMF.get().getPersistenceManager();
    try {
        FileEntity lastModified = null;
        try {
            lastModified = manager.getObjectById(FileEntity.class, SEARCH_INDEX_NAME);
        } catch (Exception e) {
            // No marker stored yet: handled below as the first run.
        }
        long current = System.currentTimeMillis();
        if (lastModified == null) {
            status.append("initialized<br>");
            lastModified = new FileEntity();
            lastModified.setPath(SEARCH_INDEX_NAME);
            lastModified.setCdate(current);
            lastModified.setMdate(0); // 0 makes the filter below match every file
        }
        status.append("lastmodified").append(lastModified.getMdate()).append("<br>");

        Query query = manager.newQuery(FileEntity.class);
        query.setFilter("extension=='html' && mdate>" + lastModified.getMdate());
        query.setOrdering("mdate"); // oldest first

        Stopwatch watch = Stopwatch.createStarted();
        @SuppressWarnings("unchecked")
        List<FileEntity> files = (List<FileEntity>) query.execute();
        List<FileEntity> toadd = FluentIterable.from(files).filter(Predicates.compose(new SafeHtmlPathPredicate(),
                new Function<FileEntity, String>() {
                    public String apply(FileEntity doc) {
                        return doc.getPath();
                    }
                }
        )).toList();
        watch.stop();
        System.out.println("execute:" + watch.elapsed(TimeUnit.MILLISECONDS));
        watch.reset();

        watch.start();
        List<Document> docs = FluentIterable.from(toadd).transform(new FileEntityToDocument()).toList();
        watch.stop();
        System.out.println("convert:" + watch.elapsed(TimeUnit.MILLISECONDS));
        watch.reset();

        watch.start();
        IndexSpec indexSpec = IndexSpec.newBuilder().setName(HTML_DB).build();
        Index index = SearchServiceFactory.getSearchService().getIndex(indexSpec);
        System.out.println("index-created");
        for (Document doc : docs) {
            System.out.println(doc.getId());
            index.put(doc);
        }
        watch.stop();
        System.out.println("put:" + watch.elapsed(TimeUnit.MILLISECONDS));

        List<String> created = FluentIterable.from(toadd).transform(new FileEntityToPath()).toList();
        String createdListText = Joiner.on("\n").join(created);
        if (!created.isEmpty()) {
            // Move the marker forward only when something was indexed.
            lastModified.setData(createdListText.getBytes(Charsets.UTF_8));
            lastModified.setMdate(current);
            manager.makePersistent(lastModified);
        }

        try {
            resp.getWriter().write(status + "update-count:" + toadd.size() + "<hr>");
            resp.getWriter().write("<pre>" + createdListText + "</pre>");
        } catch (IOException e) {
            e.printStackTrace();
        }
    } finally {
        // Closed last because the query result and the marker entity are used up to here.
        manager.close();
    }
}
/**
 * Indexes every HTML file accepted by {@link SafeHtmlPathPredicate}, oldest creation date
 * first, then stores the {@code SEARCH_INDEX_NAME} marker entity with the current time and
 * the list of indexed paths.
 *
 * <p>The status text is kept in a local buffer, and the persistence manager is closed when
 * the method finishes.
 *
 * @param req  unused
 * @param resp receives the status text, the number of indexed files, their total size in
 *             kilobytes and their paths
 */
private void doAddAll(HttpServletRequest req, HttpServletResponse resp){
    resp.setContentType(MediaType.HTML_UTF_8.toString());
    StringBuilder status = new StringBuilder();
    PersistenceManager manager = PMF.get().getPersistenceManager();
    try {
        Query query = manager.newQuery(FileEntity.class);
        query.setFilter("extension=='html'");
        query.setOrdering("cdate"); // oldest first

        BenchMarkTool benchmark = new BenchMarkTool(System.out);
        benchmark.start();
        @SuppressWarnings("unchecked")
        List<FileEntity> files = (List<FileEntity>) query.execute();
        benchmark.finish("get file by query:");

        benchmark.start();
        Collection<FileEntity> safeFiles = Collections2.filter(files, Predicates.compose(new SafeHtmlPathPredicate(),
                new Function<FileEntity, String>() {
                    @Override
                    public String apply(FileEntity input) {
                        return input.getPath();
                    }
                }
        ));
        benchmark.finish("filter useless:");

        benchmark.start();
        List<Document> docs = FluentIterable.from(safeFiles).transform(new FileEntityToDocument()).toList();
        List<Integer> utfSizes = FluentIterable.from(safeFiles).transform(new FileEntityToUtfSize()).toList();
        benchmark.finish("convert:");

        IndexSpec indexSpec = IndexSpec.newBuilder().setName(HTML_DB).build();
        Index index = SearchServiceFactory.getSearchService().getIndex(indexSpec);
        System.out.println("index-created");

        long totalBytes = 0;
        for (int size : utfSizes) {
            totalBytes += size;
        }

        benchmark.start();
        for (Document doc : docs) {
            index.put(doc);
        }
        benchmark.finish("put-time:");

        List<String> created = FluentIterable.from(safeFiles).transform(new FileEntityToPath()).toList();
        String createdListText = Joiner.on("\n").join(created);

        FileEntity lastModified = null;
        try {
            lastModified = manager.getObjectById(FileEntity.class, SEARCH_INDEX_NAME);
        } catch (Exception e) {
            // No marker stored yet: created below.
        }
        long current = System.currentTimeMillis();
        if (lastModified == null) {
            status.append("initialized<br>");
            lastModified = new FileEntity();
            lastModified.setPath(SEARCH_INDEX_NAME);
            lastModified.setCdate(current);
            lastModified.setMdate(0);
        }
        lastModified.setData(createdListText.getBytes(Charsets.UTF_8));
        lastModified.setMdate(current);
        manager.makePersistent(lastModified);

        try {
            resp.getWriter().write(status + "added count:" + safeFiles.size() + "<hr>");
            resp.getWriter().write("total utf-bytes:" + (totalBytes / 1024) + "kb" + "<br>");
            resp.getWriter().write("<pre>" + createdListText + "</pre>");
        } catch (IOException e) {
            e.printStackTrace();
        }
    } finally {
        // Closed last because the query result and the marker entity are used up to here.
        manager.close();
    }
}
//TODO createIndexAll
public class FileEntityToDocument implements Function<FileEntity,Document>{
@Override
public Document apply(FileEntity entity) {
String text=new String(entity.getData(),Charsets.UTF_8);
Document document = Document.newBuilder().setId(entity.getPath()).addField(Field.newBuilder().setName("text").setHTML(text))
.addField(Field.newBuilder().setName("title").setText(entity.getTitle()))
.build();
return document;
}
}
/**
 * Computes the size of an entity in bytes: the UTF-8 length of its path concatenated
 * with its title, plus the length of its data array.
 */
public class FileEntityToUtfSize implements Function<FileEntity,Integer>{
    @Override
    public Integer apply(FileEntity entity) {
        String pathAndTitle = entity.getPath() + entity.getTitle();
        int textBytes = Utf8.encodedLength(pathAndTitle);
        int dataBytes = entity.getData().length;
        return textBytes + dataBytes;
    }
}
/** Maps a {@code FileEntity} to its path. */
public class FileEntityToPath implements Function<FileEntity,String>{
    @Override
    public String apply(FileEntity entity) {
        final String path = entity.getPath();
        return path;
    }
}
/**
 * Small timing helper: {@link #start()} begins a measurement and {@link #finish(String)}
 * prints the elapsed time to the stream given at construction, then resets the timer so
 * the same instance can measure the next step.
 */
public static class BenchMarkTool {
    private final Stopwatch stopwatch;
    private final PrintStream stream;

    /**
     * @param stream destination of the lines printed by {@link #finish(String)}
     */
    public BenchMarkTool(PrintStream stream){
        this.stopwatch = Stopwatch.createUnstarted();
        this.stream = stream;
    }

    /** Starts a measurement. */
    public void start(){
        stopwatch.start();
    }

    /**
     * Stops the measurement and prints "{text} {ms} ms,{minutes} minute".
     *
     * @param text label printed in front of the timing
     */
    public void finish(String text){
        stopwatch.stop();
        long millis = stopwatch.elapsed(TimeUnit.MILLISECONDS);
        // Print the elapsed whole minutes; the enum constant itself used to be printed here.
        long minutes = stopwatch.elapsed(TimeUnit.MINUTES);
        stream.println(text + " " + millis + " ms," + minutes + " minute");
        stopwatch.reset();
    }
}
public static class Tag {
private String name;
private boolean singleTag;
private String specialEnd=null;//for selected
private List<Tag> childrens=new ArrayList<Tag>();
private Tag parent;
public Tag getParent() {
return parent;
}
public void setParent(Tag parent) {
this.parent = parent;
}
public List<Tag> getChildrens() {
return childrens;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
public boolean isSingleTag() {
return singleTag;
}
public void setSingleTag(boolean singleTag) {
this.singleTag = singleTag;
}
public String getText() {
return text;
}
public void setText(String text) {
this.text = text;
}
public Map<String, String> getAttributes() {
return attributes;
}
public String getAttribute(String key){
return attributes.get(key);
}
public void setAttributes(Map<String, String> attributes) {
this.attributes = attributes;
}
private String text;
private Map<String,String> attributes=new LinkedHashMap<String, String>();
public Tag(String name){
this.name=name;
}
public void addChild(Tag tag){
childrens.add(tag);
tag.setParent(this);
}
public Tag attr(String name,int value){
setAttribute(name,""+ value);
return this;
}
public Tag attr(String name,String value){
setAttribute(name, value);
return this;
}
public Tag single(){
setSingleTag(true);
return this;
}
public Tag text(String text){
setText(text);
return this;
}
public void setAttribute(String name){
setAttribute(name,name);
}
public void setAttribute(String name,String value){
attributes.put(name, value);
}
public void setId(String id){
setAttribute("id", id);
}
public void setClass(String clasz){
setAttribute("class", clasz);
}
public String getStartTagText(){
StringBuffer buffer=new StringBuffer();
buffer.append("<"+name);
for(String attr:attributes.keySet()){
String value=attributes.get(attr);
if(value.indexOf('"')!=-1){
value=value.replace("\"", """);
}
buffer.append(" "+attr+"=\""+value+"\"");
}
if(specialEnd!=null){
buffer.append(" "+specialEnd);
}
if(singleTag){
buffer.append("/>");
}else{
buffer.append(">");
}
return buffer.toString();
}
public String getEndTagText(){
if(isSingleTag()){
return "";
}
return "</"+name+">";
}
public String toString(){
if(singleTag){
return getStartTagText();
}else{
StringBuffer buffer=new StringBuffer();
buffer.append(getStartTagText());
if(text!=null){
buffer.append(text);
}
for(Tag tag:childrens){
buffer.append("\n"+tag.toString());
}
buffer.append(getEndTagText());
return buffer.toString();
}
}
public String getSpecialEnd() {
return specialEnd;
}
public void setSpecialEnd(String specialEnd) {
this.specialEnd = specialEnd;
}
}
public static class FileNames {
public static final char SLASH='/';
private char fileSeparator;
public static FileNames asSlashFileName=new FileNames(SLASH);
private FileNames(char fileSeparator){
this.fileSeparator=fileSeparator;
}
public boolean isEndsWithFileSeparator(String path){
return path.charAt(path.length()-1)==fileSeparator;
}
public static String addEndWithSeparator(String string,char separator){
if(Strings.isNullOrEmpty(string)){
return string;
}
if(string.charAt(string.length()-1)==separator){
return string;
}else{
return string+separator;
}
}
public static String removeStartWithSeparator(String string,char separator){
if(Strings.isNullOrEmpty(string)){
return string;
}
if(string.charAt(0)==separator){
return string.substring(1);
}else{
return string;
}
}
public static FileNames asSlash(){
return asSlashFileName;
}
/**
* i'm not sure why i choose method name "as.
* @param fileSeparator
* @return
*/
public static FileNames as(char fileSeparator){
return new FileNames(fileSeparator);
}
public boolean hasExtension(String path){
String ext=getFileName(path);
return ext.indexOf(".")!=-1;
}
public String getFileName(String path){
int last=path.lastIndexOf(fileSeparator);
if(last!=-1){
return path.substring(last+1);
}else{
return path;
}
}
/**
* this method have bugs ignore folder have .
* @param name
* @return
*/
public static String getExtension(String name){
String ext;
if(name.lastIndexOf(".")==-1){
ext="";
}else{
int index=name.lastIndexOf(".");
ext=name.substring(index+1,name.length());
}
return ext;
}
/**
* not support directory name contain .
* @param name
* @return
*/
public static String getRemovedExtensionName(String name){
String baseName;
if(name.lastIndexOf(".")==-1){
baseName=name;
}else{
int index=name.lastIndexOf(".");
baseName=name.substring(0,index);
}
return baseName;
}
public String getChangedExtensionName(String path,String extension){
if(hasExtension(path)){
String removed=getRemovedExtensionName(path);
return removed+"."+extension;
}else{
return path;
}
}
public String getIndexedPath(String path,String indexName){
String extension=getExtension(path);
if(extension.isEmpty()){
if(!path.endsWith(""+fileSeparator)){
path+=fileSeparator;
}
return path+indexName;
}
return path;
}
/**
* technically not filename,TODO make urls
* @param path
* @return
*/
public String getRemovedDomainName(String path){
int s=path.indexOf("://");
if(s!=-1){
int n=path.indexOf("/",s+"://".length());
if(n==-1){
return "";
}else{
return path.substring(n);
}
}
return path;
}
/**
*
* @param path
* @param isNoExtensionIsDir recognie filename which has no extension as folder
* @return
*/
public String getDirectoryPath(String path,boolean isHandleNoExtensionFileAsDir){
return getDirectoryPath(path,isHandleNoExtensionFileAsDir,true);
}
/**
*
* @param path
* @param isHandleNoExtensionFileAsDir
* @param noDirContainAsDirectory
* the case path is "name" ,if true return "name":false return ""
* @return
*/
public String getDirectoryPath(String path,boolean isHandleNoExtensionFileAsDir,boolean noDirContainAsDirectory){
String extension=getExtension(path);
if(extension.isEmpty() && (path.endsWith(""+fileSeparator) || isHandleNoExtensionFileAsDir)){
if(path.endsWith(""+fileSeparator)){
return path;
}else{
return path+fileSeparator;
}
}else{
int last=path.lastIndexOf(fileSeparator);
if(last!=-1){
return path.substring(0,last+1);
}else{
if(noDirContainAsDirectory){
return path;
}else{
return "";//root
}
}
}
}
}
}