// Collected usage snippets for the constructor:
//   org.ontoware.rdf2go.model.node.impl.URIImpl.URIImpl(String uriString)
/**
 * Converts a list of URI strings into an array of {@link URIImpl} instances,
 * preserving the order of the input list.
 *
 * @param next the list to convert; every element is expected to be a String
 *             holding a valid URI (a non-String element causes a ClassCastException,
 *             an invalid URI string propagates URIImpl's runtime exception)
 * @return an array with one URIImpl per input element, in input order
 */
private URIImpl[] convert(ArrayList next) {
    URIImpl[] out = new URIImpl[next.size()];
    // Iterate by index instead of using next.indexOf(uri): indexOf is O(n) per
    // element (O(n^2) total) and, worse, returns the FIRST occurrence, so a list
    // containing duplicate strings would write all duplicates into one slot and
    // leave the other slots null.
    for (int i = 0; i < next.size(); i++) {
        out[i] = new URIImpl((String) next.get(i));
    }
    return out;
}
/**
 * Extracts metadata and plain text from the given ContentItem with Metaxa.
 * <p>
 * Runs the extractor under the ContentItem's read lock, streams any
 * NIE plain-text literal values into a newly created "text/plain"
 * {@link ContentSink}, converts the remaining RDF2go statements to Clerezza
 * triples and adds them to the ContentItem's metadata under the write lock.
 * If text was extracted, the plain-text Blob is registered as an additional
 * part of the ContentItem.
 *
 * @param ci the ContentItem to enhance
 * @throws EngineException if extraction fails, the plain-text sink cannot be
 *         created, or the extracted text cannot be written to the Blob
 */
public void computeEnhancements(ContentItem ci) throws EngineException {
    // get model from the extraction
    URIImpl docId;
    Model m = null;
    // Read lock: extraction only reads the ContentItem's stream/URI.
    ci.getLock().readLock().lock();
    try {
        docId = new URIImpl(ci.getUri().getUnicodeString());
        m = this.extractor.extract(ci.getStream(), docId, ci.getMimeType());
    } catch (ExtractorException e) {
        throw new EngineException("Error while processing ContentItem "
                + ci.getUri() + " with Metaxa", e);
    } catch (IOException e) {
        throw new EngineException("Error while processing ContentItem "
                + ci.getUri() + " with Metaxa", e);
    } finally {
        ci.getLock().readLock().unlock();
    }
    // Convert the RDF2go model to a Clerezza ImmutableGraph and also extract
    // the extracted plain text from the model
    if (null == m) {
        // NOTE(review): extractor returned no model — treated as "nothing to do",
        // not as an error. ("preocess" typo kept: it is a runtime log string.)
        log.debug("Unable to preocess ContentItem {} (mime type {}) with Metaxa",
                ci.getUri(), ci.getMimeType());
        return;
    }
    ContentSink plainTextSink;
    try {
        plainTextSink = ciFactory.createContentSink("text/plain");
    } catch (IOException e) {
        m.close(); // release the model before bailing out
        throw new EngineException("Unable to initialise Blob for storing"
                + "the plain text content", e);
    }
    // Maps RDF2go blank nodes to their Clerezza counterparts so each blank node
    // is converted exactly once (shared with asClerezzaResource(..)).
    HashMap<BlankNode, BlankNode> blankNodeMap = new HashMap<BlankNode, BlankNode>();
    RDF2GoUtils.urifyBlankNodes(m);
    ClosableIterator<Statement> it = m.iterator();
    BufferedWriter out = new BufferedWriter(new OutputStreamWriter(
            plainTextSink.getOutputStream(), UTF8));
    boolean textExtracted = false; //used to detect if some text was extracted
    try {
        Graph g = new SimpleGraph(); //first add to a temporary graph
        while (it.hasNext()) {
            Statement oneStmt = it.next();
            //we need to treat triples that provide the plain/text
            //version differently. Such Objects need to be added to
            //the plain text Blob!
            if (oneStmt.getSubject().equals(docId)
                    && oneStmt.getPredicate().equals(NIE_PLAINTEXT_PROPERTY)) {
                String text = oneStmt.getObject().toString();
                if (text != null && !text.isEmpty()) {
                    try {
                        out.write(oneStmt.getObject().toString());
                    } catch (IOException e) {
                        throw new EngineException("Unable to write extracted"
                                + "plain text to Blob (blob impl: "
                                + plainTextSink.getBlob().getClass() + ")", e);
                    }
                    textExtracted = true;
                    // Optionally also keep the text triple in the metadata graph.
                    if (includeText) {
                        BlankNodeOrIRI subject = (BlankNodeOrIRI) asClerezzaResource(oneStmt.getSubject(), blankNodeMap);
                        IRI predicate = (IRI) asClerezzaResource(oneStmt.getPredicate(), blankNodeMap);
                        RDFTerm object = asClerezzaResource(oneStmt.getObject(), blankNodeMap);
                        g.add(new TripleImpl(subject, predicate, object));
                    }
                }
            } else {
                //add metadata to the metadata of the contentItem
                BlankNodeOrIRI subject = (BlankNodeOrIRI) asClerezzaResource(oneStmt.getSubject(), blankNodeMap);
                IRI predicate = (IRI) asClerezzaResource(oneStmt.getPredicate(), blankNodeMap);
                RDFTerm object = asClerezzaResource(oneStmt.getObject(), blankNodeMap);
                // Skip statements whose terms could not be converted.
                if (null != subject && null != predicate && null != object) {
                    Triple t = new TripleImpl(subject, predicate, object);
                    g.add(t);
                    log.debug("added " + t.toString());
                }
            }
        }
        //add the extracted triples to the metadata of the ContentItem
        // Write lock held only for the final bulk add, keeping the critical
        // section short.
        ci.getLock().writeLock().lock();
        try {
            ci.getMetadata().addAll(g);
            g = null;
        } finally {
            ci.getLock().writeLock().unlock();
        }
    } finally {
        // Always release iterator, model and writer, even on EngineException.
        it.close();
        m.close();
        IOUtils.closeQuietly(out);
    }
    if (textExtracted) {
        //add plain text to the content item
        IRI blobUri = new IRI("urn:metaxa:plain-text:" + randomUUID());
        ci.addPart(blobUri, plainTextSink.getBlob());
    }
}
/**
 * Fetches the resource at the given URL and wraps it as an Aperture
 * {@link DataObject}.
 * <p>
 * Normalizes and validates the URL, performs a conditional fetch (using the
 * stored if-modified-since date when {@code accessData} is available), follows
 * recorded protocol redirects, and maps the protocol status codes to either a
 * result object, an error object ({@code recordErrorObject}), {@code null}
 * (not modified) or an exception.
 *
 * @param urlString        the URL to fetch (will be normalized)
 * @param source           the Aperture data source the object belongs to
 * @param accessData       incremental-crawl state; may be null (no conditional
 *                         fetch, no redirect bookkeeping)
 * @param params           extra parameters (currently unread in this method)
 * @param containerFactory factory for the RDF metadata containers
 * @return the fetched DataObject, an error DataObject, or null if the resource
 *         was not modified since the last crawl
 * @throws UrlNotFoundException if the URL is invalid or the server reports NOT_FOUND
 * @throws IOException          on I/O failures during the fetch
 */
private DataObject get(String urlString, org.semanticdesktop.aperture.datasource.DataSource source,
        AccessData accessData, Map params, RDFContainerFactory containerFactory)
        throws UrlNotFoundException, IOException {
    String originalUrlString = urlString;
    URI uri = null;
    URL url = null;
    try {
        // Normalize first, then build the URIImpl — its constructor doubles as
        // URI validation (throws on malformed input).
        url = UrlUtils.normalizeURL(urlString);
        urlString = url.toExternalForm();
        uri = new URIImpl(urlString);
    } catch (Exception e) {
        crawlState.getStatus().incrementCounter(CrawlStatus.Counter.Failed);
        LOG.warn(CrawlerUtils.msgDocFailed(ds.getCollection(), ds, urlString, "Invalid URL: " + e.toString()));
        throw new UrlNotFoundException(e.getMessage());
    }
    ProtocolOutput output = null;
    // Conditional fetch: 0L means "no previous access recorded".
    Date ifModifiedSince = accessData == null ? null : getIfModifiedSince(urlString, accessData);
    long modifiedTime = ifModifiedSince != null ? ifModifiedSince.getTime() : 0L;
    long fetchTime = System.currentTimeMillis();
    output = protocol.getProtocolOutput(url, modifiedTime);
    if (output != null) {
        ProtocolStatus.Code sCode = output.getStatus().code;
        if (isRedirected(sCode)) {
            // Redirect limit exceeded.
            return recordErrorObject(uri, source, output, accessData, fetchTime, containerFactory, null,
                    "too many redirections, max = " + protocol.getConfig().getMaxRedirects()
                    + ", url = " + originalUrlString + ", lastUrl=" + urlString);
        }
        // Redirect chain recorded by the protocol layer, in hop order.
        String[] redirects = output.getMetadata().getValues("X-Proto-Redirects-URL");
        if ((redirects != null) && (redirects.length > 0)) {
            String lastUrl = urlString;
            try {
                // Validation only: the constructor throws on an invalid URI.
                URIImpl utmp = new URIImpl(lastUrl);
            } catch (Exception e) {
                return recordErrorObject(uri, source, output, accessData, fetchTime, containerFactory, e,
                        "invalid redirection, original url = " + originalUrlString + ", lastUrl=" + lastUrl);
            }
            for (String redirect : redirects) {
                if (accessData != null) {
                    // Forget cached access info for the redirecting URL and
                    // record the redirect hop; invalid hops are skipped.
                    accessData.remove(lastUrl, "date");
                    accessData.remove(lastUrl, ACCESSED_KEY);
                    try {
                        URL urlTmp = UrlUtils.normalizeURL(redirect);
                        redirect = urlTmp.toExternalForm();
                        URIImpl utmp = new URIImpl(redirect); // validation only
                    } catch (Exception e) {
                        LOG.warn("Invalid redirect URL: " + e.toString());
                        continue;
                    }
                    accessData.put(lastUrl, "redirectsTo", redirect);
                }
                lastUrl = redirect;
            }
            try {
                // From here on, uri refers to the final redirect target.
                uri = new URIImpl(lastUrl);
            } catch (Exception e) {
                crawlState.getStatus().incrementCounter(CrawlStatus.Counter.Failed);
                LOG.warn(CrawlerUtils.msgDocFailed(ds.getCollection(), ds, lastUrl,
                        "Invalid URL when redirecting from " + urlString + ": " + e.toString()));
                return recordErrorObject(uri, source, output, accessData, fetchTime, containerFactory, e,
                        "invalid URL when redirecting from " + urlString + ", lastUrl=" + lastUrl);
            }
            if (urlString.equals(lastUrl)) {
                // Self-redirect would loop forever on the next crawl.
                crawlState.getStatus().incrementCounter(CrawlStatus.Counter.Failed);
                LOG.warn(CrawlerUtils.msgDocFailed(ds.getCollection(), ds, urlString, "URL redirects to itself"));
                return recordErrorObject(uri, source, output, accessData, fetchTime, containerFactory, null,
                        "url redirects to itself: " + urlString);
            }
            urlString = lastUrl;
        }
        // Build the log/error message prefix, including the redirect chain if any.
        String msg = urlString;
        if ((redirects != null) && (redirects.length > 0)) {
            msg = msg + ", originUrl = " + originalUrlString + ", redirects = " + Arrays.toString(redirects);
        }
        if (sCode == ProtocolStatus.Code.NOT_FOUND) {
            crawlState.getStatus().incrementCounter(CrawlStatus.Counter.Not_Found);
            LOG.warn(CrawlerUtils.msgDocFailed(ds.getCollection(), ds, msg, "URL not found:, code = " + sCode));
            throw new UrlNotFoundException(urlString);
        }
        if (sCode == ProtocolStatus.Code.NOT_MODIFIED) {
            // Unchanged since the last crawl; nothing to emit.
            return null;
        }
        if (sCode == ProtocolStatus.Code.ACCESS_DENIED) {
            crawlState.getStatus().incrementCounter(CrawlStatus.Counter.Access_Denied);
            LOG.warn(CrawlerUtils.msgDocFailed(ds.getCollection(), ds, msg, "Access denied:, code = " + sCode));
            return recordErrorObject(uri, source, output, accessData, fetchTime, containerFactory, null,
                    msg + ", Access denied: code = " + sCode);
        }
        if (sCode == ProtocolStatus.Code.ROBOTS_DENIED) {
            crawlState.getStatus().incrementCounter(CrawlStatus.Counter.Robots_Denied);
            LOG.warn(CrawlerUtils.msgDocFailed(ds.getCollection(), ds, msg, "Robots denied:, code = " + sCode));
            return recordErrorObject(uri, source, output, accessData, fetchTime, containerFactory, null,
                    msg + ", Robots denied: code = " + sCode);
        }
        if (sCode != ProtocolStatus.Code.OK) {
            String message = output.getStatus().message;
            // Network-level failures (connect/DNS/socket) get a distinct error text.
            if ((sCode == ProtocolStatus.Code.EXCEPTION) && (message != null)
                    && ((message.startsWith("java.net.ConnectException"))
                        || (message.startsWith("java.net.UnknownHostException"))
                        || (message.startsWith("java.net.SocketException")))) {
                crawlState.getStatus().incrementCounter(CrawlStatus.Counter.Failed);
                LOG.warn(CrawlerUtils.msgDocFailed(ds.getCollection(), ds, msg,
                        "Failed accessing URL:" + message + ", code = " + sCode));
                return recordErrorObject(uri, source, output, accessData, fetchTime, containerFactory, null,
                        msg + ", Failed accessing URL:" + message + ", code = " + sCode);
            }
            crawlState.getStatus().incrementCounter(CrawlStatus.Counter.Failed);
            LOG.warn(CrawlerUtils.msgDocFailed(ds.getCollection(), ds, msg,
                    "Http connection error:" + output.getStatus().message + ", code = " + sCode));
            return recordErrorObject(uri, source, output, accessData, fetchTime, containerFactory, null,
                    msg + ", Http connection error:" + output.getStatus().message + ", code = " + sCode);
        }
    }
    // Success path (or output == null, which is passed through to the factory).
    DataObject result = createDataObject(uri, source, output, containerFactory);
    updateAccessData(accessData, urlString, fetchTime);
    return result;
}
/**
 * Extracts metadata and plain text from the given ContentItem with Metaxa
 * (variant using the older Clerezza API: UriRef/NonLiteral/MGraph).
 * <p>
 * Runs the extractor under the ContentItem's read lock, streams any
 * NIE plain-text literal values into a newly created "text/plain"
 * {@link ContentSink}, converts the remaining RDF2go statements to Clerezza
 * triples and adds them to the ContentItem's metadata under the write lock.
 * If text was extracted, the plain-text Blob is registered as an additional
 * part of the ContentItem.
 *
 * @param ci the ContentItem to enhance
 * @throws EngineException if extraction fails, the plain-text sink cannot be
 *         created, or the extracted text cannot be written to the Blob
 */
public void computeEnhancements(ContentItem ci) throws EngineException {
    // get model from the extraction
    URIImpl docId;
    Model m = null;
    // Read lock: extraction only reads the ContentItem's stream/URI.
    ci.getLock().readLock().lock();
    try {
        docId = new URIImpl(ci.getUri().getUnicodeString());
        m = this.extractor.extract(ci.getStream(), docId, ci.getMimeType());
    } catch (ExtractorException e) {
        throw new EngineException("Error while processing ContentItem "
                + ci.getUri() + " with Metaxa", e);
    } catch (IOException e) {
        throw new EngineException("Error while processing ContentItem "
                + ci.getUri() + " with Metaxa", e);
    } finally {
        ci.getLock().readLock().unlock();
    }
    // Convert the RDF2go model to a Clerezza Graph and also extract
    // the extracted plain text from the model
    if (null == m) {
        // NOTE(review): extractor returned no model — treated as "nothing to do",
        // not as an error. ("preocess" typo kept: it is a runtime log string.)
        log.debug("Unable to preocess ContentItem {} (mime type {}) with Metaxa",
                ci.getUri(), ci.getMimeType());
        return;
    }
    ContentSink plainTextSink;
    try {
        plainTextSink = ciFactory.createContentSink("text/plain");
    } catch (IOException e) {
        m.close(); // release the model before bailing out
        throw new EngineException("Unable to initialise Blob for storing"
                + "the plain text content", e);
    }
    // Maps RDF2go blank nodes to their Clerezza BNodes so each blank node is
    // converted exactly once (shared with asClerezzaResource(..)).
    HashMap<BlankNode, BNode> blankNodeMap = new HashMap<BlankNode, BNode>();
    RDF2GoUtils.urifyBlankNodes(m);
    ClosableIterator<Statement> it = m.iterator();
    BufferedWriter out = new BufferedWriter(new OutputStreamWriter(
            plainTextSink.getOutputStream(), UTF8));
    boolean textExtracted = false; //used to detect if some text was extracted
    try {
        MGraph g = new SimpleMGraph(); //first add to a temporary graph
        while (it.hasNext()) {
            Statement oneStmt = it.next();
            //we need to treat triples that provide the plain/text
            //version differently. Such Objects need to be added to
            //the plain text Blob!
            if (oneStmt.getSubject().equals(docId)
                    && oneStmt.getPredicate().equals(NIE_PLAINTEXT_PROPERTY)) {
                String text = oneStmt.getObject().toString();
                if (text != null && !text.isEmpty()) {
                    try {
                        out.write(oneStmt.getObject().toString());
                    } catch (IOException e) {
                        throw new EngineException("Unable to write extracted"
                                + "plain text to Blob (blob impl: "
                                + plainTextSink.getBlob().getClass() + ")", e);
                    }
                    textExtracted = true;
                    // Optionally also keep the text triple in the metadata graph.
                    if (includeText) {
                        NonLiteral subject = (NonLiteral) asClerezzaResource(oneStmt.getSubject(), blankNodeMap);
                        UriRef predicate = (UriRef) asClerezzaResource(oneStmt.getPredicate(), blankNodeMap);
                        Resource object = asClerezzaResource(oneStmt.getObject(), blankNodeMap);
                        g.add(new TripleImpl(subject, predicate, object));
                    }
                }
            } else {
                //add metadata to the metadata of the contentItem
                NonLiteral subject = (NonLiteral) asClerezzaResource(oneStmt.getSubject(), blankNodeMap);
                UriRef predicate = (UriRef) asClerezzaResource(oneStmt.getPredicate(), blankNodeMap);
                Resource object = asClerezzaResource(oneStmt.getObject(), blankNodeMap);
                // Skip statements whose terms could not be converted.
                if (null != subject && null != predicate && null != object) {
                    Triple t = new TripleImpl(subject, predicate, object);
                    g.add(t);
                    log.debug("added " + t.toString());
                }
            }
        }
        //add the extracted triples to the metadata of the ContentItem
        // Write lock held only for the final bulk add, keeping the critical
        // section short.
        ci.getLock().writeLock().lock();
        try {
            ci.getMetadata().addAll(g);
            g = null;
        } finally {
            ci.getLock().writeLock().unlock();
        }
    } finally {
        // Always release iterator, model and writer, even on EngineException.
        it.close();
        m.close();
        IOUtils.closeQuietly(out);
    }
    if (textExtracted) {
        //add plain text to the content item
        UriRef blobUri = new UriRef("urn:metaxa:plain-text:" + randomUUID());
        ci.addPart(blobUri, plainTextSink.getBlob());
    }
}
/**
 * Opens a {@link RepositoryModel} on the underlying Sesame repository.
 * <p>
 * When a non-null, non-empty context URI is given, the model is scoped to that
 * context; otherwise a model over the whole repository is opened.
 *
 * @param contextUri the context to scope the model to, or null/empty for the
 *                   whole repository
 * @return the opened RepositoryModel (never null)
 * @throws RDFRepositoryException if the connector was not initialized
 */
public RepositoryModel openRepositoryModel(URI contextUri) throws RDFRepositoryException {
    if (null == sesameRepository) {
        throw new RDFRepositoryException("The connector has not been initialized correctly");
    }
    RepositoryModel result;
    // Fixed: the original compared contextUri.toString() != "" by reference,
    // which is (practically) always true, so an empty-string context was
    // treated as a real context. isEmpty() compares content.
    if (contextUri != null && contextUri.toString() != null && !contextUri.toString().isEmpty()) {
        URIImpl contextUriInRepository = new URIImpl(contextUri.toString());
        result = new RepositoryModel(contextUriInRepository, sesameRepository);
    } else {
        result = new RepositoryModel(sesameRepository);
    }
    // Removed the original "null == result" check: a constructor never returns
    // null, so that error branch was unreachable dead code.
    result.open();
    return result;
}
@Override public Model next() { /* * Get the complete event (identified by ID) from storage */ this.currentGraphUri = eventIds.next().getValue("g").asURI(); Model model = dataSet.getModel(this.currentGraphUri); /* * Replace the generic TwitterEvent type a specific one per company keyword */ URI eventSubject = new URIImpl(this.currentGraphUri.toString() + Event.EVENT_ID_SUFFIX); if (model.contains(eventSubject, RDF.type, TwitterEvent.RDFS_CLASS)) { model.removeStatements(eventSubject, RDF.type, Variable.ANY); model.addStatement(eventSubject, RDF.type, this.eventType); } /* * Find some special characters in tweet messages */ URIImpl siocContent = new URIImpl(Namespace.SIOC.getUri() + "content"); ClosableIterator<Statement> tweetMessages = model.findStatements(eventSubject, siocContent, Variable.ANY); while (tweetMessages.hasNext()) { Statement tweetMessage = tweetMessages.next(); String oldMessage = tweetMessage.getObject().toString(); System.out.println(oldMessage); // check on unicode level: if (Pattern.compile("\\p{C}").matcher(oldMessage).find()) { System.out.println("Found char 2[] in string " + oldMessage); } // check on ascii level: //oldMessage.getBytes(StandardCharsets.US_ASCII); //String newMessage = //model.removeStatements(tweetMessage); // TODO enable for permanent change on disk //model.addStatement(eventSubject, siocContent, newMessage); } return model; }
// End of collected usage snippets.