S.F. Patch 1679972 OAIDCCrosswalk NPE and invalid character fix

git-svn-id: http://scm.dspace.org/svn/repo/trunk@2094 9c30dcfa-912a-0410-8fc2-9e0234be79fd
This commit is contained in:
Graham Triggs
2007-07-25 16:09:22 +00:00
parent aadd02c274
commit 98fc9b8786

View File

@@ -40,6 +40,9 @@
package org.dspace.app.oai; package org.dspace.app.oai;
import java.util.Properties; import java.util.Properties;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.dspace.content.DCValue; import org.dspace.content.DCValue;
import org.dspace.content.Item; import org.dspace.content.Item;
@@ -57,7 +60,12 @@ import ORG.oclc.oai.server.verb.CannotDisseminateFormatException;
*/ */
public class OAIDCCrosswalk extends Crosswalk public class OAIDCCrosswalk extends Crosswalk
{ {
public OAIDCCrosswalk(Properties properties) // Pattern containing all the characters we want to filter out / replace
// converting a String to xml
private static final Pattern invalidXmlPattern =
Pattern.compile("([^\\t\\n\\r\\u0020-\\ud7ff\\ue000-\\ufffd\\u10000-\\u10ffff]+|[&<>])");
public OAIDCCrosswalk(Properties properties)
{ {
super( super(
"http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd"); "http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd");
@@ -107,51 +115,38 @@ public class OAIDCCrosswalk extends Crosswalk
element = "creator"; element = "creator";
} }
// Escape XML chars <, > and &
String value = allDC[i].value; String value = allDC[i].value;
// Escape XML chars <, > and &
// Also replace all invalid characters with ' '
if (value != null) if (value != null)
{ {
// remove control unicode char StringBuffer valueBuf = new StringBuffer(value.length());
String temp = value.trim(); Matcher xmlMatcher = invalidXmlPattern.matcher(value.trim());
char[] dcvalue = temp.toCharArray(); while (xmlMatcher.find())
for (int charPos = 0; charPos < dcvalue.length; charPos++) {
{ String group = xmlMatcher.group();
if (Character.isISOControl(dcvalue[charPos]) &&
!String.valueOf(dcvalue[charPos]).equals("\u0009") && // group will either contain a character that we need to encode for xml
!String.valueOf(dcvalue[charPos]).equals("\n") && // (ie. <, > or &), or it will be an invalid character
!String.valueOf(dcvalue[charPos]).equals("\r")) // test the contents and replace appropriately
{
dcvalue[charPos] = ' '; if (group.equals("&"))
} xmlMatcher.appendReplacement(valueBuf, "&amp;");
} else if (group.equals("<"))
value = String.valueOf(dcvalue); xmlMatcher.appendReplacement(valueBuf, "&lt;");
else if (group.equals(">"))
xmlMatcher.appendReplacement(valueBuf, "&gt;");
else
xmlMatcher.appendReplacement(valueBuf, " ");
}
// add bit of the string after the final match
xmlMatcher.appendTail(valueBuf);
metadata.append("<dc:").append(element).append(">").append(
valueBuf.toString()).append("</dc:").append(element).append(">");
} }
// First do &'s - need to be careful not to replace the
// & in "&amp;" again!
int c = -1;
while ((c = value.indexOf("&", c + 1)) > -1)
{
value = value.substring(0, c) + "&amp;"
+ value.substring(c + 1);
}
while ((c = value.indexOf("<")) > -1)
{
value = value.substring(0, c) + "&lt;"
+ value.substring(c + 1);
}
while ((c = value.indexOf(">")) > -1)
{
value = value.substring(0, c) + "&gt;"
+ value.substring(c + 1);
}
metadata.append("<dc:").append(element).append(">").append(
value).append("</dc:").append(element).append(">");
} }
} }