/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

/*
 * This package is based on the work done by Timothy Gerard Endres
 * (time@ice.com) to whom the Ant project is very grateful for his great code.
 */

package org.apache.commons.compress.archivers.tar;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.compress.archivers.ArchiveEntry;
import org.apache.commons.compress.archivers.ArchiveInputStream;
import org.apache.commons.compress.archivers.zip.ZipEncoding;
import org.apache.commons.compress.archivers.zip.ZipEncodingHelper;
import org.apache.commons.compress.utils.ArchiveUtils;
import org.apache.commons.compress.utils.CharsetNames;
import org.apache.commons.compress.utils.IOUtils;

/**
 * The TarInputStream reads a UNIX tar archive as an InputStream.
 * Methods are provided to position at each successive entry in
 * the archive, and then read each entry as a normal input stream
 * using read().
045 * @NotThreadSafe 046 */ 047public class TarArchiveInputStream extends ArchiveInputStream { 048 049 private static final int SMALL_BUFFER_SIZE = 256; 050 051 private final byte[] smallBuf = new byte[SMALL_BUFFER_SIZE]; 052 053 /** The size the TAR header */ 054 private final int recordSize; 055 056 /** The size of a block */ 057 private final int blockSize; 058 059 /** True if file has hit EOF */ 060 private boolean hasHitEOF; 061 062 /** Size of the current entry */ 063 private long entrySize; 064 065 /** How far into the entry the stream is at */ 066 private long entryOffset; 067 068 /** An input stream to read from */ 069 private final InputStream is; 070 071 /** The meta-data about the current entry */ 072 private TarArchiveEntry currEntry; 073 074 /** The encoding of the file */ 075 private final ZipEncoding zipEncoding; 076 077 // the provided encoding (for unit tests) 078 final String encoding; 079 080 // the global PAX header 081 private Map<String, String> globalPaxHeaders = new HashMap<>(); 082 083 private final boolean lenient; 084 085 /** 086 * Constructor for TarInputStream. 087 * @param is the input stream to use 088 */ 089 public TarArchiveInputStream(final InputStream is) { 090 this(is, TarConstants.DEFAULT_BLKSIZE, TarConstants.DEFAULT_RCDSIZE); 091 } 092 093 /** 094 * Constructor for TarInputStream. 095 * @param is the input stream to use 096 * @param lenient when set to true illegal values for group/userid, mode, device numbers and timestamp will be 097 * ignored and the fields set to {@link TarArchiveEntry#UNKNOWN}. When set to false such illegal fields cause an 098 * exception instead. 099 * @since 1.19 100 */ 101 public TarArchiveInputStream(final InputStream is, boolean lenient) { 102 this(is, TarConstants.DEFAULT_BLKSIZE, TarConstants.DEFAULT_RCDSIZE, null, lenient); 103 } 104 105 /** 106 * Constructor for TarInputStream. 
107 * @param is the input stream to use 108 * @param encoding name of the encoding to use for file names 109 * @since 1.4 110 */ 111 public TarArchiveInputStream(final InputStream is, final String encoding) { 112 this(is, TarConstants.DEFAULT_BLKSIZE, TarConstants.DEFAULT_RCDSIZE, 113 encoding); 114 } 115 116 /** 117 * Constructor for TarInputStream. 118 * @param is the input stream to use 119 * @param blockSize the block size to use 120 */ 121 public TarArchiveInputStream(final InputStream is, final int blockSize) { 122 this(is, blockSize, TarConstants.DEFAULT_RCDSIZE); 123 } 124 125 /** 126 * Constructor for TarInputStream. 127 * @param is the input stream to use 128 * @param blockSize the block size to use 129 * @param encoding name of the encoding to use for file names 130 * @since 1.4 131 */ 132 public TarArchiveInputStream(final InputStream is, final int blockSize, 133 final String encoding) { 134 this(is, blockSize, TarConstants.DEFAULT_RCDSIZE, encoding); 135 } 136 137 /** 138 * Constructor for TarInputStream. 139 * @param is the input stream to use 140 * @param blockSize the block size to use 141 * @param recordSize the record size to use 142 */ 143 public TarArchiveInputStream(final InputStream is, final int blockSize, final int recordSize) { 144 this(is, blockSize, recordSize, null); 145 } 146 147 /** 148 * Constructor for TarInputStream. 149 * @param is the input stream to use 150 * @param blockSize the block size to use 151 * @param recordSize the record size to use 152 * @param encoding name of the encoding to use for file names 153 * @since 1.4 154 */ 155 public TarArchiveInputStream(final InputStream is, final int blockSize, final int recordSize, 156 final String encoding) { 157 this(is, blockSize, recordSize, encoding, false); 158 } 159 160 /** 161 * Constructor for TarInputStream. 
162 * @param is the input stream to use 163 * @param blockSize the block size to use 164 * @param recordSize the record size to use 165 * @param encoding name of the encoding to use for file names 166 * @param lenient when set to true illegal values for group/userid, mode, device numbers and timestamp will be 167 * ignored and the fields set to {@link TarArchiveEntry#UNKNOWN}. When set to false such illegal fields cause an 168 * exception instead. 169 * @since 1.19 170 */ 171 public TarArchiveInputStream(final InputStream is, final int blockSize, final int recordSize, 172 final String encoding, boolean lenient) { 173 this.is = is; 174 this.hasHitEOF = false; 175 this.encoding = encoding; 176 this.zipEncoding = ZipEncodingHelper.getZipEncoding(encoding); 177 this.recordSize = recordSize; 178 this.blockSize = blockSize; 179 this.lenient = lenient; 180 } 181 182 /** 183 * Closes this stream. Calls the TarBuffer's close() method. 184 * @throws IOException on error 185 */ 186 @Override 187 public void close() throws IOException { 188 is.close(); 189 } 190 191 /** 192 * Get the record size being used by this stream's buffer. 193 * 194 * @return The TarBuffer record size. 195 */ 196 public int getRecordSize() { 197 return recordSize; 198 } 199 200 /** 201 * Get the available data that can be read from the current 202 * entry in the archive. This does not indicate how much data 203 * is left in the entire archive, only in the current entry. 204 * This value is determined from the entry's size header field 205 * and the amount of data already read from the current entry. 206 * Integer.MAX_VALUE is returned in case more than Integer.MAX_VALUE 207 * bytes are left in the current entry in the archive. 208 * 209 * @return The number of available bytes for the current entry. 
210 * @throws IOException for signature 211 */ 212 @Override 213 public int available() throws IOException { 214 if (isDirectory()) { 215 return 0; 216 } 217 if (entrySize - entryOffset > Integer.MAX_VALUE) { 218 return Integer.MAX_VALUE; 219 } 220 return (int) (entrySize - entryOffset); 221 } 222 223 224 /** 225 * Skips over and discards <code>n</code> bytes of data from this input 226 * stream. The <code>skip</code> method may, for a variety of reasons, end 227 * up skipping over some smaller number of bytes, possibly <code>0</code>. 228 * This may result from any of a number of conditions; reaching end of file 229 * or end of entry before <code>n</code> bytes have been skipped; are only 230 * two possibilities. The actual number of bytes skipped is returned. If 231 * <code>n</code> is negative, no bytes are skipped. 232 * 233 * 234 * @param n 235 * the number of bytes to be skipped. 236 * @return the actual number of bytes skipped. 237 * @throws IOException 238 * if some other I/O error occurs. 239 */ 240 @Override 241 public long skip(final long n) throws IOException { 242 if (n <= 0 || isDirectory()) { 243 return 0; 244 } 245 246 final long available = entrySize - entryOffset; 247 final long skipped = IOUtils.skip(is, Math.min(n, available)); 248 count(skipped); 249 entryOffset += skipped; 250 return skipped; 251 } 252 253 /** 254 * Since we do not support marking just yet, we return false. 255 * 256 * @return False. 257 */ 258 @Override 259 public boolean markSupported() { 260 return false; 261 } 262 263 /** 264 * Since we do not support marking just yet, we do nothing. 265 * 266 * @param markLimit The limit to mark. 267 */ 268 @Override 269 public void mark(final int markLimit) { 270 } 271 272 /** 273 * Since we do not support marking just yet, we do nothing. 274 */ 275 @Override 276 public synchronized void reset() { 277 } 278 279 /** 280 * Get the next entry in this tar archive. 
This will skip 281 * over any remaining data in the current entry, if there 282 * is one, and place the input stream at the header of the 283 * next entry, and read the header and instantiate a new 284 * TarEntry from the header bytes and return that entry. 285 * If there are no more entries in the archive, null will 286 * be returned to indicate that the end of the archive has 287 * been reached. 288 * 289 * @return The next TarEntry in the archive, or null. 290 * @throws IOException on error 291 */ 292 public TarArchiveEntry getNextTarEntry() throws IOException { 293 if (isAtEOF()) { 294 return null; 295 } 296 297 if (currEntry != null) { 298 /* Skip will only go to the end of the current entry */ 299 IOUtils.skip(this, Long.MAX_VALUE); 300 301 /* skip to the end of the last record */ 302 skipRecordPadding(); 303 } 304 305 final byte[] headerBuf = getRecord(); 306 307 if (headerBuf == null) { 308 /* hit EOF */ 309 currEntry = null; 310 return null; 311 } 312 313 try { 314 currEntry = new TarArchiveEntry(headerBuf, zipEncoding, lenient); 315 } catch (final IllegalArgumentException e) { 316 throw new IOException("Error detected parsing the header", e); 317 } 318 319 entryOffset = 0; 320 entrySize = currEntry.getSize(); 321 322 if (currEntry.isGNULongLinkEntry()) { 323 final byte[] longLinkData = getLongNameData(); 324 if (longLinkData == null) { 325 // Bugzilla: 40334 326 // Malformed tar file - long link entry name not followed by 327 // entry 328 return null; 329 } 330 currEntry.setLinkName(zipEncoding.decode(longLinkData)); 331 } 332 333 if (currEntry.isGNULongNameEntry()) { 334 final byte[] longNameData = getLongNameData(); 335 if (longNameData == null) { 336 // Bugzilla: 40334 337 // Malformed tar file - long entry name not followed by 338 // entry 339 return null; 340 } 341 currEntry.setName(zipEncoding.decode(longNameData)); 342 } 343 344 if (currEntry.isGlobalPaxHeader()){ // Process Global Pax headers 345 readGlobalPaxHeaders(); 346 } 347 348 if 
(currEntry.isPaxHeader()){ // Process Pax headers 349 paxHeaders(); 350 } else if (!globalPaxHeaders.isEmpty()) { 351 applyPaxHeadersToCurrentEntry(globalPaxHeaders); 352 } 353 354 if (currEntry.isOldGNUSparse()){ // Process sparse files 355 readOldGNUSparse(); 356 } 357 358 // If the size of the next element in the archive has changed 359 // due to a new size being reported in the posix header 360 // information, we update entrySize here so that it contains 361 // the correct value. 362 entrySize = currEntry.getSize(); 363 364 return currEntry; 365 } 366 367 /** 368 * The last record block should be written at the full size, so skip any 369 * additional space used to fill a record after an entry 370 */ 371 private void skipRecordPadding() throws IOException { 372 if (!isDirectory() && this.entrySize > 0 && this.entrySize % this.recordSize != 0) { 373 final long numRecords = (this.entrySize / this.recordSize) + 1; 374 final long padding = (numRecords * this.recordSize) - this.entrySize; 375 final long skipped = IOUtils.skip(is, padding); 376 count(skipped); 377 } 378 } 379 380 /** 381 * Get the next entry in this tar archive as longname data. 382 * 383 * @return The next entry in the archive as longname data, or null. 
384 * @throws IOException on error 385 */ 386 protected byte[] getLongNameData() throws IOException { 387 // read in the name 388 final ByteArrayOutputStream longName = new ByteArrayOutputStream(); 389 int length = 0; 390 while ((length = read(smallBuf)) >= 0) { 391 longName.write(smallBuf, 0, length); 392 } 393 getNextEntry(); 394 if (currEntry == null) { 395 // Bugzilla: 40334 396 // Malformed tar file - long entry name not followed by entry 397 return null; 398 } 399 byte[] longNameData = longName.toByteArray(); 400 // remove trailing null terminator(s) 401 length = longNameData.length; 402 while (length > 0 && longNameData[length - 1] == 0) { 403 --length; 404 } 405 if (length != longNameData.length) { 406 final byte[] l = new byte[length]; 407 System.arraycopy(longNameData, 0, l, 0, length); 408 longNameData = l; 409 } 410 return longNameData; 411 } 412 413 /** 414 * Get the next record in this tar archive. This will skip 415 * over any remaining data in the current entry, if there 416 * is one, and place the input stream at the header of the 417 * next entry. 418 * 419 * <p>If there are no more entries in the archive, null will be 420 * returned to indicate that the end of the archive has been 421 * reached. At the same time the {@code hasHitEOF} marker will be 422 * set to true.</p> 423 * 424 * @return The next header in the archive, or null. 425 * @throws IOException on error 426 */ 427 private byte[] getRecord() throws IOException { 428 byte[] headerBuf = readRecord(); 429 setAtEOF(isEOFRecord(headerBuf)); 430 if (isAtEOF() && headerBuf != null) { 431 tryToConsumeSecondEOFRecord(); 432 consumeRemainderOfLastBlock(); 433 headerBuf = null; 434 } 435 return headerBuf; 436 } 437 438 /** 439 * Determine if an archive record indicate End of Archive. End of 440 * archive is indicated by a record that consists entirely of null bytes. 441 * 442 * @param record The record data to check. 
443 * @return true if the record data is an End of Archive 444 */ 445 protected boolean isEOFRecord(final byte[] record) { 446 return record == null || ArchiveUtils.isArrayZero(record, recordSize); 447 } 448 449 /** 450 * Read a record from the input stream and return the data. 451 * 452 * @return The record data or null if EOF has been hit. 453 * @throws IOException on error 454 */ 455 protected byte[] readRecord() throws IOException { 456 457 final byte[] record = new byte[recordSize]; 458 459 final int readNow = IOUtils.readFully(is, record); 460 count(readNow); 461 if (readNow != recordSize) { 462 return null; 463 } 464 465 return record; 466 } 467 468 private void readGlobalPaxHeaders() throws IOException { 469 globalPaxHeaders = parsePaxHeaders(this); 470 getNextEntry(); // Get the actual file entry 471 } 472 473 private void paxHeaders() throws IOException{ 474 final Map<String, String> headers = parsePaxHeaders(this); 475 getNextEntry(); // Get the actual file entry 476 applyPaxHeadersToCurrentEntry(headers); 477 } 478 479 // NOTE, using a Map here makes it impossible to ever support GNU 480 // sparse files using the PAX Format 0.0, see 481 // https://www.gnu.org/software/tar/manual/html_section/tar_92.html#SEC188 482 Map<String, String> parsePaxHeaders(final InputStream i) 483 throws IOException { 484 final Map<String, String> headers = new HashMap<>(globalPaxHeaders); 485 // Format is "length keyword=value\n"; 486 while(true){ // get length 487 int ch; 488 int len = 0; 489 int read = 0; 490 while((ch = i.read()) != -1) { 491 read++; 492 if (ch == '\n') { // blank line in header 493 break; 494 } else if (ch == ' '){ // End of length string 495 // Get keyword 496 final ByteArrayOutputStream coll = new ByteArrayOutputStream(); 497 while((ch = i.read()) != -1) { 498 read++; 499 if (ch == '='){ // end of keyword 500 final String keyword = coll.toString(CharsetNames.UTF_8); 501 // Get rest of entry 502 final int restLen = len - read; 503 if (restLen == 1) { // 
only NL 504 headers.remove(keyword); 505 } else { 506 final byte[] rest = new byte[restLen]; 507 final int got = IOUtils.readFully(i, rest); 508 if (got != restLen) { 509 throw new IOException("Failed to read " 510 + "Paxheader. Expected " 511 + restLen 512 + " bytes, read " 513 + got); 514 } 515 // Drop trailing NL 516 final String value = new String(rest, 0, 517 restLen - 1, CharsetNames.UTF_8); 518 headers.put(keyword, value); 519 } 520 break; 521 } 522 coll.write((byte) ch); 523 } 524 break; // Processed single header 525 } 526 len *= 10; 527 len += ch - '0'; 528 } 529 if (ch == -1){ // EOF 530 break; 531 } 532 } 533 return headers; 534 } 535 536 private void applyPaxHeadersToCurrentEntry(final Map<String, String> headers) { 537 currEntry.updateEntryFromPaxHeaders(headers); 538 539 } 540 541 /** 542 * Adds the sparse chunks from the current entry to the sparse chunks, 543 * including any additional sparse entries following the current entry. 544 * 545 * @throws IOException on error 546 * 547 * @todo Sparse files get not yet really processed. 548 */ 549 private void readOldGNUSparse() throws IOException { 550 /* we do not really process sparse files yet 551 sparses = new ArrayList(); 552 sparses.addAll(currEntry.getSparses()); 553 */ 554 if (currEntry.isExtended()) { 555 TarArchiveSparseEntry entry; 556 do { 557 final byte[] headerBuf = getRecord(); 558 if (headerBuf == null) { 559 currEntry = null; 560 break; 561 } 562 entry = new TarArchiveSparseEntry(headerBuf); 563 /* we do not really process sparse files yet 564 sparses.addAll(entry.getSparses()); 565 */ 566 } while (entry.isExtended()); 567 } 568 } 569 570 private boolean isDirectory() { 571 return currEntry != null && currEntry.isDirectory(); 572 } 573 574 /** 575 * Returns the next Archive Entry in this Stream. 
576 * 577 * @return the next entry, 578 * or {@code null} if there are no more entries 579 * @throws IOException if the next entry could not be read 580 */ 581 @Override 582 public ArchiveEntry getNextEntry() throws IOException { 583 return getNextTarEntry(); 584 } 585 586 /** 587 * Tries to read the next record rewinding the stream if it is not a EOF record. 588 * 589 * <p>This is meant to protect against cases where a tar 590 * implementation has written only one EOF record when two are 591 * expected. Actually this won't help since a non-conforming 592 * implementation likely won't fill full blocks consisting of - by 593 * default - ten records either so we probably have already read 594 * beyond the archive anyway.</p> 595 */ 596 private void tryToConsumeSecondEOFRecord() throws IOException { 597 boolean shouldReset = true; 598 final boolean marked = is.markSupported(); 599 if (marked) { 600 is.mark(recordSize); 601 } 602 try { 603 shouldReset = !isEOFRecord(readRecord()); 604 } finally { 605 if (shouldReset && marked) { 606 pushedBackBytes(recordSize); 607 is.reset(); 608 } 609 } 610 } 611 612 /** 613 * Reads bytes from the current tar archive entry. 614 * 615 * This method is aware of the boundaries of the current 616 * entry in the archive and will deal with them as if they 617 * were this stream's start and EOF. 618 * 619 * @param buf The buffer into which to place bytes read. 620 * @param offset The offset at which to place bytes read. 621 * @param numToRead The number of bytes to read. 622 * @return The number of bytes read, or -1 at EOF. 
623 * @throws IOException on error 624 */ 625 @Override 626 public int read(final byte[] buf, final int offset, int numToRead) throws IOException { 627 int totalRead = 0; 628 629 if (isAtEOF() || isDirectory() || entryOffset >= entrySize) { 630 return -1; 631 } 632 633 if (currEntry == null) { 634 throw new IllegalStateException("No current tar entry"); 635 } 636 637 numToRead = Math.min(numToRead, available()); 638 639 totalRead = is.read(buf, offset, numToRead); 640 641 if (totalRead == -1) { 642 if (numToRead > 0) { 643 throw new IOException("Truncated TAR archive"); 644 } 645 setAtEOF(true); 646 } else { 647 count(totalRead); 648 entryOffset += totalRead; 649 } 650 651 return totalRead; 652 } 653 654 /** 655 * Whether this class is able to read the given entry. 656 * 657 * <p>May return false if the current entry is a sparse file.</p> 658 */ 659 @Override 660 public boolean canReadEntryData(final ArchiveEntry ae) { 661 if (ae instanceof TarArchiveEntry) { 662 final TarArchiveEntry te = (TarArchiveEntry) ae; 663 return !te.isSparse(); 664 } 665 return false; 666 } 667 668 /** 669 * Get the current TAR Archive Entry that this input stream is processing 670 * 671 * @return The current Archive Entry 672 */ 673 public TarArchiveEntry getCurrentEntry() { 674 return currEntry; 675 } 676 677 protected final void setCurrentEntry(final TarArchiveEntry e) { 678 currEntry = e; 679 } 680 681 protected final boolean isAtEOF() { 682 return hasHitEOF; 683 } 684 685 protected final void setAtEOF(final boolean b) { 686 hasHitEOF = b; 687 } 688 689 /** 690 * This method is invoked once the end of the archive is hit, it 691 * tries to consume the remaining bytes under the assumption that 692 * the tool creating this archive has padded the last block. 
693 */ 694 private void consumeRemainderOfLastBlock() throws IOException { 695 final long bytesReadOfLastBlock = getBytesRead() % blockSize; 696 if (bytesReadOfLastBlock > 0) { 697 final long skipped = IOUtils.skip(is, blockSize - bytesReadOfLastBlock); 698 count(skipped); 699 } 700 } 701 702 /** 703 * Checks if the signature matches what is expected for a tar file. 704 * 705 * @param signature 706 * the bytes to check 707 * @param length 708 * the number of bytes to check 709 * @return true, if this stream is a tar archive stream, false otherwise 710 */ 711 public static boolean matches(final byte[] signature, final int length) { 712 if (length < TarConstants.VERSION_OFFSET+TarConstants.VERSIONLEN) { 713 return false; 714 } 715 716 if (ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_POSIX, 717 signature, TarConstants.MAGIC_OFFSET, TarConstants.MAGICLEN) 718 && 719 ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_POSIX, 720 signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN) 721 ){ 722 return true; 723 } 724 if (ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_GNU, 725 signature, TarConstants.MAGIC_OFFSET, TarConstants.MAGICLEN) 726 && 727 ( 728 ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_GNU_SPACE, 729 signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN) 730 || 731 ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_GNU_ZERO, 732 signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN) 733 ) 734 ){ 735 return true; 736 } 737 // COMPRESS-107 - recognise Ant tar files 738 return ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_ANT, 739 signature, TarConstants.MAGIC_OFFSET, TarConstants.MAGICLEN) 740 && 741 ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_ANT, 742 signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN); 743 } 744 745}